Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into release/1.0.0

Fix import paddle.v2.plot to import paddle.utils

Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into release/1.0.0
Fix import paddle.v2.plot to import paddle.utils
903e3f3f · shippingwang · 77c8ddb5 · 3b7e20b0 · 903e3f3f · 903e3f3f
106 changed file
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y --allow-downgrades patchelf \
+    python3 python3-dev python3-pip \
    git python-pip python-dev python-opencv openssh-server bison \
    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN easy_install -U pip && \
+RUN pip3 install -U wheel && \
+    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    easy_install -U pip && \
    pip install -U wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark
-RUN pip install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install pre-commit 'ipython==5.3.0' && \
+    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 install opencv-python && \
+    pip install pre-commit 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip install opencv-python
 #For docstring checker
+RUN pip3 install pylint pytest astroid isort
 RUN pip install pylint pytest astroid isort LinkChecker
 COPY ./python/requirements.txt /root/
+RUN pip3 install -r /root/requirements.txt
 RUN pip install -r /root/requirements.txt
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
+RUN pip3 install certifi urllib3[secure]
 RUN pip install certifi urllib3[secure]

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS
        /usr/local/opt/openblas/lib)
 find_path(OPENBLAS_INC_DIR NAMES cblas.h
-  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
 find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
 find_library(OPENBLAS_LIB NAMES openblas

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND})
    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-    SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
    SET(CBLAS_LIBRARIES
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
@@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND})
    ENDIF(NOT WIN32)
    SET(CBLAS_PROVIDER openblas)
    IF(WITH_C_API)
-        INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
+        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
        # Because libopenblas.a is a symbolic link of another library, thus need to
        # install the whole directory.
        IF(ANDROID)
@@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF(NOT ${CBLAS_FOUND})
 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}")
-INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)

--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
@@ -102,8 +102,8 @@ class Float16Transpiler:
                continue
            for input_arg in current_op.input_arg_names:
                if input_arg in self.input_map:
-                    current_op.rename_input(input_arg,
+                    current_op._rename_input(input_arg,
-                                            self.input_map[input_arg])
+                                             self.input_map[input_arg])
    def _remove_unused_var(self):
        '''
@@ -187,7 +187,7 @@ class Float16Transpiler:
                    shape=var.shape,
                    persistable=var.persistable)
                find_op(var)
-                var.op.rename_output(var_name, tmp_var_name)
+                var.op._rename_output(var_name, tmp_var_name)
                self.block._insert_op(
                    i,
                    type="cast",

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=
 paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
-paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -38,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en
 paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
 paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspilerConfig.__init__ 
 paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
@@ -66,7 +49,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var
 paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
 paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
-paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'use_mkldnn', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, False, None, False, None))
+paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
 paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
@@ -79,14 +62,14 @@ paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label',
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
-paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None))
+paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None))
+paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False))
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
-paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None))
+paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
-paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None))
+paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
-paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, False, None, None, None, False, False))
+paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@@ -162,21 +145,31 @@ paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, key
 paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'out', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None, None))
+paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None))
-paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
+paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
+paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
+paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
+paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
+paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
+paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
+paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
 paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
-paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False))
+paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
 paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
 paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
-paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.sum ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.logical_and ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.logical_or ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.logical_xor ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.logical_not ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
+paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -239,16 +232,6 @@ paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'],
 paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both'))
 paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
-paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -282,11 +265,11 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp
 paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,))
 paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
 paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
-paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
-paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
-paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
+paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -321,7 +304,7 @@ paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=[
 paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
 paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
@@ -330,11 +313,11 @@ paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpo
 paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ 
-paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False))
+paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
 paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
 paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
 paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
-paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True, False))
+paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
 paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -167,15 +167,8 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-# cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
-# disable test temporarily.
-# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
-# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
-#         channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
-#         conditional_block_op while_op assign_op print_op executor proto_desc)
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <stddef.h>            // for size_t
-#include <condition_variable>  // NOLINT
-#include <typeindex>
-#include "paddle/fluid/platform/enforce.h"
-namespace paddle {
-namespace framework {
-enum class ChannelAction {
-  SEND = 0,
-  RECEIVE = 1,
-  CLOSE = 2,
-};
-// Channel is the abstract class of buffered and un-buffered channels.
-template <typename T>
-class Channel {
- public:
-  virtual bool CanSend() = 0;
-  virtual bool CanReceive() = 0;
-  virtual void Send(T*) = 0;
-  virtual bool Receive(T*) = 0;
-  virtual size_t Cap() = 0;
-  virtual void Lock() = 0;
-  virtual void Unlock() = 0;
-  virtual bool IsClosed() = 0;
-  virtual void Close() = 0;
-  virtual ~Channel() {}
-  virtual void AddToSendQ(const void* referrer, T* data,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(ChannelAction)> cb) = 0;
-  virtual void AddToReceiveQ(const void* referrer, T* data,
-                             std::shared_ptr<std::condition_variable_any> cond,
-                             std::function<bool(ChannelAction)> cb) = 0;
-  virtual void RemoveFromSendQ(const void* referrer) = 0;
-  virtual void RemoveFromReceiveQ(const void* referrer) = 0;
-};
-// Forward declaration of channel implementations.
-template <typename T>
-class ChannelImpl;
-template <typename T>
-Channel<T>* MakeChannel(size_t buffer_size) {
-  return new ChannelImpl<T>(buffer_size);
-}
-template <typename T>
-void CloseChannel(Channel<T>* ch) {
-  ch->Close();
-}
-/*
- * The ChannelHolder class serves two main purposes:
- * 1. It acts as a unified wrapper for the different kinds of
- *    channels, i.e. Buffered and Unbuffered channels. This is
- *    similar to the ReaderHolder class.
- * 2. It also helps us in TypeHiding. This is similar to the
- *    PlaceHolder implementations in variable.h and tensor.h.
- */
-class ChannelHolder {
- public:
-  template <typename T>
-  void Reset(size_t buffer_size) {
-    holder_.reset(new PlaceholderImpl<T>(buffer_size));
-  }
-  template <typename T>
-  void Send(T* data) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    PADDLE_ENFORCE_EQ(
-        holder_->Type(), std::type_index(typeid(T)),
-        "Channel type is not same as the type of the data being sent");
-    // Static cast should be safe because we have ensured that types are same
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
-    channel->Send(data);
-  }
-  template <typename T>
-  bool Receive(T* data) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    PADDLE_ENFORCE_EQ(
-        holder_->Type(), std::type_index(typeid(T)),
-        "Channel type is not same as the type of the data being sent");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
-    return channel->Receive(data);
-  }
-  bool IsClosed() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->IsClosed();
-  }
-  bool CanSend() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->CanSend();
-  }
-  bool CanReceive() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->CanReceive();
-  }
-  void close() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Close();
-  }
-  size_t Cap() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->Cap();
-  }
-  void Lock() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Lock();
-  }
-  void Unlock() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Unlock();
-  }
-  template <typename T>
-  void AddToSendQ(const void* referrer, T* data,
-                  std::shared_ptr<std::condition_variable_any> cond,
-                  std::function<bool(ChannelAction)> cb) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    if (channel != nullptr) {
-      channel->AddToSendQ(referrer, data, cond, cb);
-    }
-  }
-  template <typename T>
-  void AddToReceiveQ(const void* referrer, T* data,
-                     std::shared_ptr<std::condition_variable_any> cond,
-                     std::function<bool(ChannelAction)> cb) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    if (channel != nullptr) {
-      channel->AddToReceiveQ(referrer, data, cond, cb);
-    }
-  }
-  void RemoveFromSendQ(const void* referrer) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->RemoveFromSendQ(referrer);
-  }
-  void RemoveFromReceiveQ(const void* referrer) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->RemoveFromReceiveQ(referrer);
-  }
-  inline bool IsInitialized() const { return holder_ != nullptr; }
-  inline const std::type_index Type() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->Type();
-  }
- private:
-  /**
-   * @note    Placeholder hides type T, so it doesn't appear as a template
-   *          parameter of ChannelHolder.
-   */
-  struct Placeholder {
-    virtual ~Placeholder() {}
-    virtual const std::type_index Type() const = 0;
-    virtual void* Ptr() const = 0;
-    virtual bool IsClosed() = 0;
-    virtual bool CanSend() = 0;
-    virtual bool CanReceive() = 0;
-    virtual void RemoveFromSendQ(const void* referrer) = 0;
-    virtual void RemoveFromReceiveQ(const void* referrer) = 0;
-    virtual void Close() = 0;
-    virtual void Lock() = 0;
-    virtual void Unlock() = 0;
-    virtual size_t Cap() = 0;
-  };
-  template <typename T>
-  struct PlaceholderImpl : public Placeholder {
-    explicit PlaceholderImpl(size_t buffer_size)
-        : type_(std::type_index(typeid(T))) {
-      channel_.reset(MakeChannel<T>(buffer_size));
-    }
-    virtual const std::type_index Type() const { return type_; }
-    virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
-    virtual bool IsClosed() {
-      if (channel_) {
-        return channel_->IsClosed();
-      }
-      return false;
-    }
-    virtual bool CanSend() {
-      if (channel_) {
-        return channel_->CanSend();
-      }
-      return false;
-    }
-    virtual bool CanReceive() {
-      if (channel_) {
-        return channel_->CanReceive();
-      }
-      return false;
-    }
-    virtual void RemoveFromSendQ(const void* referrer) {
-      if (channel_) {
-        channel_->RemoveFromSendQ(referrer);
-      }
-    }
-    virtual void RemoveFromReceiveQ(const void* referrer) {
-      if (channel_) {
-        channel_->RemoveFromReceiveQ(referrer);
-      }
-    }
-    virtual void Close() {
-      if (channel_) channel_->Close();
-    }
-    virtual size_t Cap() {
-      if (channel_)
-        return channel_->Cap();
-      else
-        return -1;
-    }
-    virtual void Lock() {
-      if (channel_) channel_->Lock();
-    }
-    virtual void Unlock() {
-      if (channel_) channel_->Unlock();
-    }
-    std::unique_ptr<Channel<T>> channel_;
-    const std::type_index type_;
-  };
-  // Pointer to a PlaceholderImpl object
-  std::unique_ptr<Placeholder> holder_;
-};
-}  // namespace framework
-}  // namespace paddle
-#include "paddle/fluid/framework/channel_impl.h"
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <stddef.h>  // for size_t
-#include <atomic>
-#include <condition_variable>  // NOLINT
-#include <deque>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/platform/enforce.h"
-namespace paddle {
-namespace framework {
-template <typename T>
-class ChannelImpl : public paddle::framework::Channel<T> {
-  friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
-  friend void paddle::framework::CloseChannel<T>(Channel<T> *);
- public:
-  virtual bool CanSend();
-  virtual bool CanReceive();
-  virtual void Send(T *);
-  virtual bool Receive(T *);
-  virtual size_t Cap() { return cap_; }
-  virtual void Lock();
-  virtual void Unlock();
-  virtual bool IsClosed();
-  virtual void Close();
-  explicit ChannelImpl(size_t);
-  virtual ~ChannelImpl();
-  virtual void AddToSendQ(const void *referrer, T *data,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(ChannelAction)> cb);
-  virtual void AddToReceiveQ(const void *referrer, T *data,
-                             std::shared_ptr<std::condition_variable_any> cond,
-                             std::function<bool(ChannelAction)> cb);
-  virtual void RemoveFromSendQ(const void *referrer);
-  virtual void RemoveFromReceiveQ(const void *referrer);
- private:
-  struct QueueMessage {
-    T *data;
-    std::shared_ptr<std::condition_variable_any> cond;
-    bool chan_closed = false;
-    bool completed = false;
-    const void *referrer;  // TODO(thuan): figure out better way to do this
-    std::function<bool(ChannelAction)> callback;
-    explicit QueueMessage(T *item)
-        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
-    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
-        : data(item), cond(cond) {}
-    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
-      cond->wait(lock, [this]() { return completed; });
-    }
-    void Notify() {
-      completed = true;
-      cond->notify_all();
-    }
-  };
-  void send_return() {
-    send_ctr--;
-    destructor_cond_.notify_all();
-  }
-  bool recv_return(bool value) {
-    recv_ctr--;
-    destructor_cond_.notify_all();
-    return value;
-  }
-  std::shared_ptr<QueueMessage> get_first_message(
-      std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
-    while (!queue->empty()) {
-      // Check whether this message was added by Select
-      // If this was added by Select then execute the callback
-      // to check if you can execute this message. The callback
-      // can return false if some other case was executed in Select.
-      // In that case just discard this QueueMessage and process next.
-      std::shared_ptr<QueueMessage> m = queue->front();
-      queue->pop_front();
-      if (m->callback == nullptr || m->callback(action)) return m;
-    }
-    return nullptr;
-  }
-  size_t cap_;
-  std::recursive_mutex mu_;
-  bool closed_;
-  std::deque<T> buf_;
-  std::deque<std::shared_ptr<QueueMessage>> recvq;
-  std::deque<std::shared_ptr<QueueMessage>> sendq;
-  std::atomic<unsigned> send_ctr{0};
-  std::atomic<unsigned> recv_ctr{0};
-  std::condition_variable_any destructor_cond_;
-};
-template <typename T>
-ChannelImpl<T>::ChannelImpl(size_t capacity)
-    : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
-  PADDLE_ENFORCE_GE(capacity, 0);
-}
-template <typename T>
-bool ChannelImpl<T>::CanSend() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return !closed_ && (!recvq.empty() || buf_.size() < cap_);
-}
-template <typename T>
-bool ChannelImpl<T>::CanReceive() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
-}
-template <typename T>
-void ChannelImpl<T>::Send(T *item) {
-  send_ctr++;
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  // If channel is closed, throw exception
-  if (closed_) {
-    send_return();
-    lock.unlock();
-    PADDLE_THROW("Cannot send on closed channel");
-  }
-  // If there is a receiver, directly pass the value we want
-  // to send to the receiver, bypassing the channel buffer if any
-  if (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m =
-        get_first_message(&recvq, ChannelAction::SEND);
-    if (m != nullptr) {
-      *(m->data) = std::move(*item);
-      m->Notify();
-      send_return();
-      return;
-    } else {
-      Send(item);
-      send_return();
-      return;
-    }
-  }
-  // Unbuffered channel will always bypass this
-  // If buffered channel has space in buffer,
-  // write the element to the buffer.
-  if (buf_.size() < cap_) {
-    // Copy to buffer
-    buf_.push_back(std::move(*item));
-    send_return();
-    return;
-  }
-  // Block on channel, because some receiver will complete
-  // the operation for us
-  auto m = std::make_shared<QueueMessage>(item);
-  sendq.push_back(m);
-  m->Wait(lock);
-  if (m->chan_closed) {
-    send_return();
-    lock.unlock();
-    PADDLE_THROW("Cannot send on closed channel");
-  }
-  send_return();
-}
-template <typename T>
-bool ChannelImpl<T>::Receive(T *item) {
-  recv_ctr++;
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  // If channel is closed and buffer is empty or
-  // channel is unbuffered
-  if (closed_ && buf_.empty()) return recv_return(false);
-  // If there is a sender, directly receive the value we want
-  // from the sender. In case of a buffered channel, read from
-  // buffer and move front of send queue to the buffer
-  if (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m =
-        get_first_message(&sendq, ChannelAction::RECEIVE);
-    if (buf_.size() > 0) {
-      // Case 1 : Channel is Buffered
-      // Do Data transfer from front of buffer
-      // and add a QueueMessage to the buffer
-      *item = std::move(buf_.front());
-      buf_.pop_front();
-      // If first message from sendq is not null
-      // add it to the buffer and notify it
-      if (m != nullptr) {
-        // Copy to buffer
-        buf_.push_back(std::move(*(m->data)));
-        m->Notify();
-      }  // Ignore if there is no first message
-    } else {
-      // Case 2: Channel is Unbuffered
-      // Do data transfer from front of SendQ
-      // If front is nullptr, then recursively call itself
-      if (m != nullptr) {
-        *item = std::move(*(m->data));
-        m->Notify();
-      } else {
-        return recv_return(Receive(item));
-      }
-    }
-    return recv_return(true);
-  }
-  // If this is a buffered channel and there are items in buffer
-  if (buf_.size() > 0) {
-    // Directly read from buffer
-    *item = std::move(buf_.front());
-    buf_.pop_front();
-    // return true
-    return recv_return(true);
-  }
-  // No sender available, block on this channel
-  // Some receiver will complete the option for us
-  auto m = std::make_shared<QueueMessage>(item);
-  recvq.push_back(m);
-  m->Wait(lock);
-  return recv_return(!m->chan_closed);
-}
-template <typename T>
-void ChannelImpl<T>::Lock() {
-  mu_.lock();
-}
-template <typename T>
-void ChannelImpl<T>::Unlock() {
-  mu_.unlock();
-}
-template <typename T>
-bool ChannelImpl<T>::IsClosed() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return closed_;
-}
-template <typename T>
-void ChannelImpl<T>::Close() {
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  if (closed_) {
-    // TODO(abhinavarora): closing an already closed channel should panic
-    lock.unlock();
-    return;
-  }
-  closed_ = true;
-  // Empty the readers
-  while (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m = recvq.front();
-    recvq.pop_front();
-    m->chan_closed = true;
-    // Execute callback function (if any)
-    if (m->callback != nullptr) {
-      m->callback(ChannelAction::CLOSE);
-    }
-    m->Notify();
-  }
-  // Empty the senders
-  while (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m = sendq.front();
-    sendq.pop_front();
-    m->chan_closed = true;
-    // Execute callback function (if any)
-    if (m->callback != nullptr) {
-      m->callback(ChannelAction::CLOSE);
-    }
-    m->Notify();
-  }
-}
-template <typename T>
-void ChannelImpl<T>::AddToSendQ(
-    const void *referrer, T *data,
-    std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(ChannelAction)> cb) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  auto m = std::make_shared<QueueMessage>(data, cond);
-  m->referrer = referrer;
-  m->callback = cb;
-  sendq.push_back(m);
-}
-template <typename T>
-void ChannelImpl<T>::AddToReceiveQ(
-    const void *referrer, T *data,
-    std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(ChannelAction)> cb) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  auto m = std::make_shared<QueueMessage>(data, cond);
-  m->referrer = referrer;
-  m->callback = cb;
-  recvq.push_back(m);
-}
-template <typename T>
-void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  for (auto it = sendq.begin(); it != sendq.end();) {
-    std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
-    if (sendMsg->referrer == referrer) {
-      it = sendq.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-template <typename T>
-void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  for (auto it = recvq.begin(); it != recvq.end();) {
-    std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
-    if (recvMsg->referrer == referrer) {
-      it = recvq.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-template <typename T>
-ChannelImpl<T>::~ChannelImpl() {
-  Close();
-  // The destructor must wait for all readers and writers to complete their task
-  // The channel has been closed, so we will not accept new readers and writers
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  destructor_cond_.wait(lock,
-                        [this]() { return send_ctr == 0 && recv_ctr == 0; });
-}
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/framework/channel.h"
-#include <chrono>  // NOLINT
-#include <thread>  // NOLINT
-#include "gtest/gtest.h"
-using paddle::framework::Channel;
-using paddle::framework::ChannelHolder;
-using paddle::framework::MakeChannel;
-using paddle::framework::CloseChannel;
-TEST(Channel, ChannelCapacityTest) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  EXPECT_EQ(ch->Cap(), buffer_size);
-  CloseChannel(ch);
-  delete ch;
-  ch = MakeChannel<size_t>(0);
-  EXPECT_EQ(ch->Cap(), 0U);
-  CloseChannel(ch);
-  delete ch;
-}
-void RecevingOrderEqualToSendingOrder(Channel<int> *ch, int num_items) {
-  unsigned sum_send = 0;
-  std::thread t([&]() {
-    for (int i = 0; i < num_items; i++) {
-      ch->Send(&i);
-      sum_send += i;
-    }
-  });
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));
-  for (int i = 0; i < num_items; i++) {
-    int recv = -1;
-    EXPECT_EQ(ch->Receive(&recv), true);
-    EXPECT_EQ(recv, i);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));
-  CloseChannel(ch);
-  t.join();
-  unsigned expected_sum = (num_items * (num_items - 1)) / 2;
-  EXPECT_EQ(sum_send, expected_sum);
-  delete ch;
-}
-TEST(Channel, SufficientBufferSizeDoesntBlock) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Send(&i);
-  }
-  size_t out;
-  for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Receive(&out), true);  // should not block
-    EXPECT_EQ(out, i);
-  }
-  CloseChannel(ch);
-  delete ch;
-}
-// This tests that a  channel must return false
-// on send and receive performed after closing the channel.
-// Receive will only return false after close when queue is empty.
-// By creating separate threads for sending and receiving, we make this
-// function able to test both buffered and unbuffered channels.
-void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
-  const size_t data = 5;
-  std::thread send_thread{[&]() {
-    size_t i = data;
-    ch->Send(&i);  // should not block
-  }};
-  std::thread recv_thread{[&]() {
-    size_t i;
-    EXPECT_EQ(ch->Receive(&i), true);  // should not block
-    EXPECT_EQ(i, data);
-  }};
-  send_thread.join();
-  recv_thread.join();
-  // After closing send should panic. Receive should
-  // also  false as there is no data in queue.
-  CloseChannel(ch);
-  send_thread = std::thread{[&]() {
-    size_t i = data;
-    bool is_exception = false;
-    try {
-      ch->Send(&i);
-    } catch (paddle::platform::EnforceNotMet e) {
-      is_exception = true;
-    }
-    EXPECT_EQ(is_exception, true);
-  }};
-  recv_thread = std::thread{[&]() {
-    size_t i;
-    // should return false because channel is closed and queue is empty
-    EXPECT_EQ(ch->Receive(&i), false);
-  }};
-  send_thread.join();
-  recv_thread.join();
-}
-TEST(Channel, SendReceiveClosedBufferedChannelPanics) {
-  size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  SendReceiveWithACloseChannelShouldPanic(ch);
-  delete ch;
-}
-TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) {
-  auto ch = MakeChannel<size_t>(0);
-  SendReceiveWithACloseChannelShouldPanic(ch);
-  delete ch;
-}
-TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Send(&i);  // sending should not block
-  }
-  size_t out;
-  for (size_t i = 0; i < buffer_size / 2; ++i) {
-    EXPECT_EQ(ch->Receive(&out), true);  // receiving should not block
-    EXPECT_EQ(out, i);
-  }
-  CloseChannel(ch);
-  for (size_t i = buffer_size / 2; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Receive(&out),
-              true);  // receving should return residual values.
-    EXPECT_EQ(out, i);
-  }
-  for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Receive(&out),
-              false);  // receiving on closed channel should return false
-  }
-  delete ch;
-}
-TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  std::thread t([&]() {
-    // Try to write more than buffer size.
-    for (size_t i = 0; i < 2 * buffer_size; ++i) {
-      if (i < buffer_size) {
-        ch->Send(&i);  // should block after 10 iterations
-      } else {
-        bool is_exception = false;
-        try {
-          ch->Send(&i);
-        } catch (paddle::platform::EnforceNotMet e) {
-          is_exception = true;
-        }
-        EXPECT_EQ(is_exception, true);
-      }
-    }
-  });
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  CloseChannel(ch);
-  t.join();
-  delete ch;
-}
-TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
-  auto ch = MakeChannel<int>(0);
-  RecevingOrderEqualToSendingOrder(ch, 20);
-}
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) {
-  // Test that Receive Order is same as Send Order when number of items
-  // sent is less than size of buffer
-  auto ch = MakeChannel<int>(10);
-  RecevingOrderEqualToSendingOrder(ch, 5);
-}
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) {
-  // Test that Receive Order is same as Send Order when number of items
-  // sent is equal to size of buffer
-  auto ch = MakeChannel<int>(10);
-  RecevingOrderEqualToSendingOrder(ch, 10);
-}
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
-  // Test that Receive Order is same as Send Order when number of items
-  // sent is greater than the size of buffer
-  auto ch = MakeChannel<int>(10);
-  RecevingOrderEqualToSendingOrder(ch, 20);
-}
-void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          EXPECT_EQ(ch->Receive(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  // Verify that all the threads are blocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-  // Explicitly close the channel
-  // This should unblock all receivers
-  CloseChannel(ch);
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  bool send_success[kNumThreads];
-  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    send_success[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended, bool *success) {
-          int data = 10;
-          bool is_exception = false;
-          try {
-            ch->Send(&data);
-          } catch (paddle::platform::EnforceNotMet e) {
-            is_exception = true;
-          }
-          *success = !is_exception;
-          *ended = true;
-        },
-        &thread_ended[i], &send_success[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  if (isBuffered) {
-    // If ch is Buffered, atleast 4 threads must be blocked.
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (!thread_ended[i]) ct++;
-    }
-    EXPECT_GE(ct, 4);
-  } else {
-    // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < kNumThreads; i++) {
-      EXPECT_EQ(thread_ended[i], false);
-    }
-  }
-  // Explicitly close the thread
-  // This should unblock all senders
-  CloseChannel(ch);
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  if (isBuffered) {
-    // Verify that only 1 send was successful
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (send_success[i]) ct++;
-    }
-    // Only 1 send must be successful
-    EXPECT_EQ(ct, 1);
-  }
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-// This tests that closing a buffered channel also unblocks
-//  any receivers waiting on the channel
-TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
-  auto ch = MakeChannel<int>(1);
-  ChannelCloseUnblocksReceiversTest(ch);
-  delete ch;
-}
-// This tests that closing a buffered channel also unblocks
-//  any senders waiting for channel to have write space
-TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
-  auto ch = MakeChannel<int>(1);
-  ChannelCloseUnblocksSendersTest(ch, true);
-  delete ch;
-}
-// This tests that closing an unbuffered channel also unblocks
-//  unblocks any receivers waiting for senders
-TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
-  auto ch = MakeChannel<int>(0);
-  ChannelCloseUnblocksReceiversTest(ch);
-  delete ch;
-}
-// This tests that closing an unbuffered channel also unblocks
-//  unblocks any senders waiting for senders
-TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
-  auto ch = MakeChannel<int>(0);
-  ChannelCloseUnblocksSendersTest(ch, false);
-  delete ch;
-}
-TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
-  auto ch = MakeChannel<int>(0);
-  unsigned sum_send = 0;
-  // Send should block after three iterations
-  // since we only have three receivers.
-  std::thread t([&]() {
-    // Try to send more number of times
-    // than receivers
-    for (int i = 0; i < 4; i++) {
-      try {
-        ch->Send(&i);
-        sum_send += i;
-      } catch (paddle::platform::EnforceNotMet e) {
-      }
-    }
-  });
-  for (int i = 0; i < 3; i++) {
-    int recv;
-    ch->Receive(&recv);
-    EXPECT_EQ(recv, i);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  EXPECT_EQ(sum_send, 3U);
-  CloseChannel(ch);
-  t.join();
-  delete ch;
-}
-TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
-  auto ch = MakeChannel<int>(0);
-  unsigned sum_send = 0;
-  unsigned sum_receive = 0;
-  // The receiver should block after 5
-  // iterations, since there are only 5 senders.
-  std::thread t([&]() {
-    for (int i = 0; i < 8; i++) {
-      int recv;
-      ch->Receive(&recv);  // should block after the fifth iteration.
-      EXPECT_EQ(recv, i);
-      sum_receive += i;
-    }
-  });
-  for (int i = 0; i < 5; i++) {
-    ch->Send(&i);
-    sum_send += i;
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  EXPECT_EQ(sum_send, 10U);
-  EXPECT_EQ(sum_receive, 10U);
-  // send three more elements
-  for (int i = 5; i < 8; i++) {
-    ch->Send(&i);
-    sum_send += i;
-  }
-  CloseChannel(ch);
-  t.join();
-  EXPECT_EQ(sum_send, 28U);
-  EXPECT_EQ(sum_receive, 28U);
-  delete ch;
-}
-// This tests that destroying a channel unblocks
-//  any senders waiting for channel to have write space
-void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  bool send_success[kNumThreads];
-  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    send_success[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended, bool *success) {
-          int data = 10;
-          bool is_exception = false;
-          try {
-            ch->Send(&data);
-          } catch (paddle::platform::EnforceNotMet e) {
-            is_exception = true;
-          }
-          *success = !is_exception;
-          *ended = true;
-        },
-        &thread_ended[i], &send_success[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  if (isBuffered) {
-    // If channel is buffered, verify that atleast 4 threads are blocked
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (thread_ended[i] == false) ct++;
-    }
-    // Atleast 4 threads must be blocked
-    EXPECT_GE(ct, 4);
-  } else {
-    // Verify that all the threads are blocked
-    for (size_t i = 0; i < kNumThreads; i++) {
-      EXPECT_EQ(thread_ended[i], false);
-    }
-  }
-  // Explicitly destroy the channel
-  delete ch;
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  // Count number of successful sends
-  int ct = 0;
-  for (size_t i = 0; i < kNumThreads; i++) {
-    if (send_success[i]) ct++;
-  }
-  if (isBuffered) {
-    // Only 1 send must be successful
-    EXPECT_EQ(ct, 1);
-  } else {
-    // In unbuffered channel, no send should be successful
-    EXPECT_EQ(ct, 0);
-  }
-  // Join all threads
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-// This tests that destroying a channel also unblocks
-//  any receivers waiting on the channel
-void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          // All reads should return false
-          EXPECT_EQ(ch->Receive(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
-  // Verify that all threads are blocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-  // delete the channel
-  delete ch;
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
-  size_t buffer_size = 1;
-  auto ch = MakeChannel<int>(buffer_size);
-  ChannelDestroyUnblockReceivers(ch);
-}
-TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
-  size_t buffer_size = 1;
-  auto ch = MakeChannel<int>(buffer_size);
-  ChannelDestroyUnblockSenders(ch, true);
-}
-// This tests that destroying an unbuffered channel also unblocks
-//  unblocks any receivers waiting for senders
-TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
-  auto ch = MakeChannel<int>(0);
-  ChannelDestroyUnblockReceivers(ch);
-}
-TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
-  auto ch = MakeChannel<int>(0);
-  ChannelDestroyUnblockSenders(ch, false);
-}
-TEST(ChannelHolder, ChannelHolderCapacityTest) {
-  const size_t buffer_size = 10;
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(buffer_size);
-  EXPECT_EQ(ch->Cap(), buffer_size);
-  delete ch;
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  EXPECT_EQ(ch->Cap(), 0U);
-  delete ch;
-}
-void ChannelHolderSendReceive(ChannelHolder *ch) {
-  unsigned sum_send = 0;
-  std::thread t([&]() {
-    for (int i = 0; i < 5; i++) {
-      ch->Send(&i);
-      sum_send += i;
-    }
-  });
-  for (int i = 0; i < 5; i++) {
-    int recv;
-    EXPECT_EQ(ch->Receive(&recv), true);
-    EXPECT_EQ(recv, i);
-  }
-  ch->close();
-  t.join();
-  EXPECT_EQ(sum_send, 10U);
-}
-TEST(ChannelHolder, ChannelHolderBufferedSendReceiveTest) {
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(10);
-  ChannelHolderSendReceive(ch);
-  delete ch;
-}
-TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) {
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderSendReceive(ch);
-  delete ch;
-}
-TEST(ChannelHolder, ChannelUninitializedTest) {
-  ChannelHolder *ch = new ChannelHolder();
-  EXPECT_EQ(ch->IsInitialized(), false);
-  int i = 10;
-  bool send_exception = false;
-  try {
-    ch->Send(&i);
-  } catch (paddle::platform::EnforceNotMet e) {
-    send_exception = true;
-  }
-  EXPECT_EQ(send_exception, true);
-  bool recv_exception = false;
-  try {
-    ch->Receive(&i);
-  } catch (paddle::platform::EnforceNotMet e) {
-    recv_exception = true;
-  }
-  EXPECT_EQ(recv_exception, true);
-  bool is_exception = false;
-  try {
-    ch->Type();
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-}
-TEST(ChannelHolder, ChannelInitializedTest) {
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(2);
-  EXPECT_EQ(ch->IsInitialized(), true);
-  // Channel should remain intialized even after close
-  ch->close();
-  EXPECT_EQ(ch->IsInitialized(), true);
-  delete ch;
-}
-TEST(ChannelHolder, TypeMismatchSendTest) {
-  // Test with unbuffered channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  bool is_exception = false;
-  bool boolean_data = true;
-  try {
-    ch->Send(&boolean_data);
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-  // Test with Buffered Channel
-  ch = new ChannelHolder();
-  ch->Reset<float>(10);
-  is_exception = false;
-  int int_data = 23;
-  try {
-    ch->Send(&int_data);
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-}
-TEST(ChannelHolder, TypeMismatchReceiveTest) {
-  // Test with unbuffered channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  bool is_exception = false;
-  bool float_data;
-  try {
-    ch->Receive(&float_data);
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-  // Test with Buffered Channel
-  ch = new ChannelHolder();
-  ch->Reset<float>(10);
-  is_exception = false;
-  int int_data = 23;
-  try {
-    ch->Receive(&int_data);
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-}
-void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          EXPECT_EQ(ch->Receive(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  // Verify that all the threads are blocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-  // Explicitly close the channel
-  // This should unblock all receivers
-  ch->close();
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  bool send_success[kNumThreads];
-  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    send_success[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended, bool *success) {
-          int data = 10;
-          bool is_exception = false;
-          try {
-            ch->Send(&data);
-          } catch (paddle::platform::EnforceNotMet e) {
-            is_exception = true;
-          }
-          *success = !is_exception;
-          *ended = true;
-        },
-        &thread_ended[i], &send_success[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  if (isBuffered) {
-    // If ch is Buffered, atleast 4 threads must be blocked.
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (!thread_ended[i]) ct++;
-    }
-    EXPECT_GE(ct, 4);
-  } else {
-    // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < kNumThreads; i++) {
-      EXPECT_EQ(thread_ended[i], false);
-    }
-  }
-  // Explicitly close the thread
-  // This should unblock all senders
-  ch->close();
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  if (isBuffered) {
-    // Verify that only 1 send was successful
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (send_success[i]) ct++;
-    }
-    // Only 1 send must be successful
-    EXPECT_EQ(ct, 1);
-  }
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-// This tests that closing a channelholder unblocks
-//  any receivers waiting on the channel
-TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) {
-  // Check for buffered channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(1);
-  ChannelHolderCloseUnblocksReceiversTest(ch);
-  delete ch;
-  // Check for unbuffered channel
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderCloseUnblocksReceiversTest(ch);
-  delete ch;
-}
-// This tests that closing a channelholder unblocks
-//  any senders waiting for channel to have write space
-TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
-  // Check for buffered channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(1);
-  ChannelHolderCloseUnblocksSendersTest(ch, true);
-  delete ch;
-  // Check for unbuffered channel
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderCloseUnblocksSendersTest(ch, false);
-  delete ch;
-}
-// This tests that destroying a channelholder unblocks
-//  any senders waiting for channel
-void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  bool send_success[kNumThreads];
-  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    send_success[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended, bool *success) {
-          int data = 10;
-          bool is_exception = false;
-          try {
-            ch->Send(&data);
-          } catch (paddle::platform::EnforceNotMet e) {
-            is_exception = true;
-          }
-          *success = !is_exception;
-          *ended = true;
-        },
-        &thread_ended[i], &send_success[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  if (isBuffered) {
-    // If channel is buffered, verify that atleast 4 threads are blocked
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (thread_ended[i] == false) ct++;
-    }
-    // Atleast 4 threads must be blocked
-    EXPECT_GE(ct, 4);
-  } else {
-    // Verify that all the threads are blocked
-    for (size_t i = 0; i < kNumThreads; i++) {
-      EXPECT_EQ(thread_ended[i], false);
-    }
-  }
-  // Explicitly destroy the channel
-  delete ch;
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  // Count number of successfuld sends
-  int ct = 0;
-  for (size_t i = 0; i < kNumThreads; i++) {
-    if (send_success[i]) ct++;
-  }
-  if (isBuffered) {
-    // Only 1 send must be successful
-    EXPECT_EQ(ct, 1);
-  } else {
-    // In unbuffered channel, no send should be successful
-    EXPECT_EQ(ct, 0);
-  }
-  // Join all threads
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-// This tests that destroying a channelholder also unblocks
-//  any receivers waiting on the channel
-void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          // All reads should return false
-          EXPECT_EQ(ch->Receive(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  // Verify that all threads are blocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-  // delete the channel
-  delete ch;
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
-  // Check for Buffered Channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(1);
-  ChannelHolderDestroyUnblockReceivers(ch);
-  // ch is already deleted already deleted in
-  // ChannelHolderDestroyUnblockReceivers
-  // Check for Unbuffered channel
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderDestroyUnblockReceivers(ch);
-}
-TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
-  // Check for Buffered Channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(1);
-  ChannelHolderDestroyUnblockSenders(ch, true);
-  // ch is already deleted already deleted in
-  // ChannelHolderDestroyUnblockReceivers
-  // Check for Unbuffered channel
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderDestroyUnblockSenders(ch, false);
-}
-// This tests that closing a channelholder many times.
-void ChannelHolderManyTimesClose(ChannelHolder *ch) {
-  const int kNumThreads = 15;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  // Launches threads that try to send data to channel.
-  for (size_t i = 0; i < kNumThreads / 3; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended) {
-          int data = 10;
-          ch->Send(&data);
-          *ended = true;
-        },
-        &thread_ended[i]);
-  }
-  // Launches threads that try to receive data to channel.
-  for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          if (ch->Receive(&data)) {
-            EXPECT_EQ(data, 10);
-          }
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  // Launches threads that try to close the channel.
-  for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          if (!ch->IsClosed()) {
-            ch->close();
-          }
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
-  // Verify that all threads are unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  EXPECT_TRUE(ch->IsClosed());
-  // delete the channel
-  delete ch;
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
-  // Check for Buffered Channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(10);
-  ChannelHolderManyTimesClose(ch);
-}
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <thread>  // NOLINT
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-USE_NO_KERNEL_OP(go);
-USE_NO_KERNEL_OP(channel_close);
-USE_NO_KERNEL_OP(channel_create);
-USE_NO_KERNEL_OP(channel_recv);
-USE_NO_KERNEL_OP(channel_send);
-USE_NO_KERNEL_OP(elementwise_add);
-USE_NO_KERNEL_OP(select);
-USE_NO_KERNEL_OP(conditional_block);
-USE_NO_KERNEL_OP(equal);
-USE_NO_KERNEL_OP(assign);
-USE_NO_KERNEL_OP(while);
-USE_NO_KERNEL_OP(print);
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-namespace paddle {
-namespace framework {
-template <typename T>
-LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place,
-                          std::string name, T value) {
-  // Create LoDTensor<int> of dim [1]
-  auto var = scope->Var(name);
-  auto tensor = var->GetMutable<LoDTensor>();
-  tensor->Resize({1});
-  T *expect = tensor->mutable_data<T>(place);
-  expect[0] = value;
-  return tensor;
-}
-void AddOp(const std::string &type, const VariableNameMap &inputs,
-           const VariableNameMap &outputs, AttributeMap attrs,
-           BlockDesc *block) {
-  // insert op
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place,
-             BlockDesc *casesBlock, int caseId, int caseType,
-             std::string caseChannel, std::string caseVarName,
-             std::function<void(BlockDesc *, Scope *)> func) {
-  std::string caseCondName = std::string("caseCond") + std::to_string(caseId);
-  std::string caseCondXVarName =
-      std::string("caseCondX") + std::to_string(caseId);
-  BlockDesc *caseBlock = program->AppendBlock(*casesBlock);
-  func(caseBlock, scope);
-  CreateVariable(scope, *place, caseCondName, false);
-  CreateVariable(scope, *place, caseCondXVarName, caseId);
-  CreateVariable(scope, *place, caseVarName, caseId);
-  scope->Var("step_scope");
-  AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}},
-        {{"Out", {caseCondName}}}, {}, casesBlock);
-  AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}},
-        {{"Out", {}}, {"Scope", {"step_scope"}}},
-        {{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock);
-}
-void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
-                        BlockDesc *parentBlock, std::string dataChanName,
-                        std::string quitChanName) {
-  BlockDesc *whileBlock = program->AppendBlock(*parentBlock);
-  CreateVariable(scope, *place, "whileExitCond", true);
-  CreateVariable(scope, *place, "caseToExecute", -1);
-  CreateVariable(scope, *place, "case1var", 0);
-  CreateVariable(scope, *place, "xtemp", 0);
-  // TODO(thuan): Need to create fibXToSend, since channel send moves the actual
-  // data,
-  // which causes the data to be no longer accessible to do the fib calculation
-  // TODO(abhinav): Change channel send to do a copy instead of a move!
-  CreateVariable(scope, *place, "fibXToSend", 0);
-  CreateVariable(scope, *place, "fibX", 0);
-  CreateVariable(scope, *place, "fibY", 1);
-  CreateVariable(scope, *place, "quitVar", 0);
-  BlockDesc *casesBlock = program->AppendBlock(*whileBlock);
-  std::function<void(BlockDesc * caseBlock)> f = [](BlockDesc *caseBlock) {};
-  // TODO(thuan): Remove this once we change channel send to do a copy instead
-  // of move
-  AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock);
-  // Case 0: Send to dataChanName
-  std::function<void(BlockDesc * caseBlock, Scope * scope)> case0Func = [&](
-      BlockDesc *caseBlock, Scope *scope) {
-    AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock);
-    AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock);
-    AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}},
-          {{"Out", {"fibY"}}}, {}, caseBlock);
-  };
-  AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend",
-          case0Func);
-  std::string case0Config =
-      std::string("0,1,") + dataChanName + std::string(",fibXToSend");
-  // Case 1: Receive from quitChanName
-  std::function<void(BlockDesc * caseBlock, Scope * scope)> case2Func = [&](
-      BlockDesc *caseBlock, Scope *scope) {
-    // Exit the while loop after we receive from quit channel.
-    // We assign a false to "whileExitCond" variable, which will
-    // break out of while_op loop
-    CreateVariable(scope, *place, "whileFalse", false);
-    AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {},
-          caseBlock);
-  };
-  AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar",
-          case2Func);
-  std::string case1Config =
-      std::string("1,2,") + quitChanName + std::string(",quitVar");
-  // Select block
-  AddOp("select", {{"X", {dataChanName, quitChanName}},
-                   {"case_to_execute", {"caseToExecute"}}},
-        {{"Out", {}}},
-        {{"sub_block", casesBlock},
-         {"cases", std::vector<std::string>{case0Config, case1Config}}},
-        whileBlock);
-  scope->Var("stepScopes");
-  AddOp("while",
-        {{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}},
-        {{"Out", {}}, {"StepScopes", {"stepScopes"}}},
-        {{"sub_block", whileBlock}}, parentBlock);
-}
-TEST(Concurrency, Go_Op) {
-  Scope scope;
-  p::CPUPlace place;
-  // Initialize scope variables
-  p::CPUDeviceContext ctx(place);
-  // Create channel variable
-  scope.Var("Channel");
-  // Create Variables, x0 will be put into channel,
-  // result will be pulled from channel
-  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(&scope, place, "x0", 99);
-  CreateVariable(&scope, place, "result", 0);
-  framework::Executor executor(place);
-  ProgramDesc program;
-  BlockDesc *block = program.MutableBlock(0);
-  // Create channel OP
-  AddOp("channel_create", {}, {{"Out", {"Channel"}}},
-        {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
-        block);
-  // Create Go Op routine
-  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
-  AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
-        {{"Status", {"Status"}}}, {}, goOpBlock);
-  // Create Go Op
-  AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
-        block);
-  // Create Channel Receive Op
-  AddOp("channel_recv", {{"Channel", {"Channel"}}},
-        {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
-  // Create Channel Close Op
-  AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
-  // Check the result tensor to make sure it is set to 0
-  const LoDTensor &tensor = (scope.FindVar("result"))->Get<LoDTensor>();
-  auto *initialData = tensor.data<int>();
-  EXPECT_EQ(initialData[0], 0);
-  executor.Run(program, &scope, 0, true, true);
-  // After we call executor.run, the Go operator should do a channel_send to
-  // set the "result" variable to 99.
-  auto *finalData = tensor.data<int>();
-  EXPECT_EQ(finalData[0], 99);
-}
-/**
- * This test implements the fibonacci function using go_op and select_op
- */
-TEST(Concurrency, Select) {
-  Scope scope;
-  p::CPUPlace place;
-  // Initialize scope variables
-  p::CPUDeviceContext ctx(place);
-  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(&scope, place, "result", 0);
-  CreateVariable(&scope, place, "currentXFib", 0);
-  framework::Executor executor(place);
-  ProgramDesc program;
-  BlockDesc *block = program.MutableBlock(0);
-  // Create channel OP
-  std::string dataChanName = "Channel";
-  scope.Var(dataChanName);
-  AddOp("channel_create", {}, {{"Out", {dataChanName}}},
-        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
-  std::string quitChanName = "Quit";
-  scope.Var(quitChanName);
-  AddOp("channel_create", {}, {{"Out", {quitChanName}}},
-        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
-  // Create Go Op routine, which loops 10 times over fibonacci sequence
-  CreateVariable(&scope, place, "xReceiveVar", 0);
-  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
-  for (int i = 0; i < 10; ++i) {
-    AddOp("channel_recv", {{"Channel", {dataChanName}}},
-          {{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock);
-    AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}},
-          {{"first_n", 100},
-           {"summarize", -1},
-           {"print_tensor_name", false},
-           {"print_tensor_type", true},
-           {"print_tensor_shape", false},
-           {"print_tensor_lod", false},
-           {"print_phase", std::string("FORWARD")},
-           {"message", std::string("X: ")}},
-          goOpBlock);
-  }
-  CreateVariable(&scope, place, "quitSignal", 0);
-  AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}},
-        {{"Status", {"Status"}}}, {}, goOpBlock);
-  // Create Go Op
-  AddOp("go", {{"X", {dataChanName, quitChanName}}}, {},
-        {{"sub_block", goOpBlock}}, block);
-  AddFibonacciSelect(&scope, &place, &program, block, dataChanName,
-                     quitChanName);
-  // Create Channel Close Op
-  AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block);
-  AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block);
-  executor.Run(program, &scope, 0, true, true);
-  // After we call executor.run, "result" variable should be equal to 34
-  // (which is 10 loops through fibonacci sequence)
-  const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get<LoDTensor>();
-  auto *finalData = tensor.data<int>();
-  EXPECT_EQ(finalData[0], 34);
-}
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -76,15 +75,13 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
    var->GetMutable<platform::PlaceList>();
  } else if (var_type == proto::VarType::READER) {
    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::CHANNEL) {
-    var->GetMutable<ChannelHolder>();
  } else if (var_type == proto::VarType::RAW) {
    // GetMutable will be called in operator
  } else {
    PADDLE_THROW(
        "Variable type %d is not in "
        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
+        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
        var_type);
  }
 }

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -126,7 +126,6 @@ message VarType {
    LOD_TENSOR_ARRAY = 13;
    PLACE_LIST = 14;
    READER = 15;
-    CHANNEL = 16;
    // Any runtime decided variable type is raw
    // raw variables should manage their own allocations
    // in operators like nccl_op
@@ -158,12 +157,6 @@ message VarType {
  message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
  optional ReaderDesc reader = 5;
-  message ChannelDesc {
-    required Type data_type = 1;
-    required int64 capacity = 2;
-  }
-  optional ChannelDesc channel = 6;
  message Tuple { repeated Type element_type = 1; }
  optional Tuple tuple = 7;
 }

--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/ir/graph_traits.h"
+#include <vector>
 namespace paddle {
 namespace framework {
 namespace ir {

--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
    selected_rows_.reset(new SelectedRows(rows, height));
    Tensor* value = selected_rows_->mutable_value();
-    value->mutable_data<float>(
+    auto* data = value->mutable_data<float>(
        make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
+    for (int64_t i = 0; i < value->numel(); ++i) {
+      data[i] = static_cast<float>(i);
+    }
  }
 protected:
@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
  ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
  ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
+  auto* dst_data = dst_tensor.value().data<float>();
+  for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
+    ASSERT_EQ(dst_data[i], static_cast<float>(i));
+  }
 }
 TEST(SelectedRows, SparseTable) {

--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/var_desc.h"

--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
@@ -88,13 +88,7 @@ std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
 }
 void VarDesc::SetDataType(proto::VarType::Type data_type) {
-  switch (desc_.type().type()) {
+  mutable_tensor_desc()->set_data_type(data_type);
-    case proto::VarType::CHANNEL:
-      mutable_channel_desc()->set_data_type(data_type);
-      break;
-    default:
-      mutable_tensor_desc()->set_data_type(data_type);
-  }
 }
 void VarDesc::SetDataTypes(
@@ -115,13 +109,7 @@ void VarDesc::SetDataTypes(
 }
 proto::VarType::Type VarDesc::GetDataType() const {
-  switch (desc_.type().type()) {
+  return tensor_desc().data_type();
-    case proto::VarType::CHANNEL:
-      return channel_desc().data_type();
-      break;
-    default:
-      return tensor_desc().data_type();
-  }
 }
 std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const {
@@ -134,17 +122,6 @@ std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const {
  return res;
 }
-void VarDesc::SetCapacity(int64_t capacity) {
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      desc_.mutable_type()->mutable_channel()->set_capacity(capacity);
-      break;
-    default:
-      PADDLE_THROW("Setting 'capacity' is not supported by the type of var %s.",
-                   this->Name());
-  }
-}
 void VarDesc::SetLoDLevel(int32_t lod_level) {
  switch (desc_.type().type()) {
    case proto::VarType::LOD_TENSOR:
@@ -214,19 +191,6 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
  }
 }
-const proto::VarType::ChannelDesc &VarDesc::channel_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return desc_.type().channel();
-    default:
-      PADDLE_THROW(
-          "Getting 'channel_desc' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
 const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
@@ -262,20 +226,6 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
  }
 }
-proto::VarType::ChannelDesc *VarDesc::mutable_channel_desc() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return desc_.mutable_type()->mutable_channel();
-    default:
-      PADDLE_THROW(
-          "Getting 'mutable_channel_desc' is not supported by the type of var "
-          "%s.",
-          this->Name());
-  }
-}
 proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");

--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -87,8 +87,6 @@ class VarDesc {
  void SetDataTypes(
      const std::vector<proto::VarType::Type> &multiple_data_type);
-  void SetCapacity(int64_t capacity);
  proto::VarType::Type GetDataType() const;
  std::vector<proto::VarType::Type> GetDataTypes() const;
@@ -110,10 +108,8 @@ class VarDesc {
  void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }
 private:
-  const proto::VarType::ChannelDesc &channel_desc() const;
  const proto::VarType::TensorDesc &tensor_desc() const;
  std::vector<proto::VarType::TensorDesc> tensor_descs() const;
-  proto::VarType::ChannelDesc *mutable_channel_desc();
  proto::VarType::TensorDesc *mutable_tensor_desc();
  std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs();

--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -41,8 +40,6 @@ inline proto::VarType::Type ToVarType(std::type_index type) {
    return proto::VarType_Type_SELECTED_ROWS;
  } else if (IsType<ReaderHolder>(type)) {
    return proto::VarType_Type_READER;
-  } else if (IsType<ChannelHolder>(type)) {
-    return proto::VarType_Type_CHANNEL;
  } else {
    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
  }
@@ -66,9 +63,6 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
    case proto::VarType_Type_READER:
      visitor(var.Get<ReaderHolder>());
      return;
-    case proto::VarType_Type_CHANNEL:
-      visitor(var.Get<ChannelHolder>());
-      return;
    default:
      PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
  }

--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -41,12 +41,6 @@ class AnalysisPass {
  // all passes have run.
  virtual bool Finalize() { return false; }
-  // Get a Pass appropriate to print the Node this pass operates on.
-  virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
-                                          const std::string &banner) const {
-    return nullptr;
-  }
  // Create a debugger Pass that draw the DFG by graphviz toolkit.
  virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) {
 TEST(Analyzer, analysis_with_tensorrt) {
  FLAGS_IA_enable_tensorrt_subgraph_engine = true;
  Argument argument;
+  argument.Set<int>("minimum_subgraph_size", new int(0));
+  argument.Set<int>("max_batch_size", new int(3));
+  argument.Set<int>("workspace_size", new int(1 << 20));
+  argument.Set<std::string>("precision_mode", new std::string("FP32"));
  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
  Analyzer analyser;
  analyser.Run(&argument);
 }
-void TestWord2vecPrediction(const std::string &model_path) {
+void TestWord2vecPrediction(const std::string& model_path) {
  NativeConfig config;
  config.model_dir = model_path;
  config.use_gpu = false;
@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
  // The outputs' buffers are in CPU memory.
  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
    LOG(INFO) << "data: "
-              << static_cast<float *>(outputs.front().data.data())[i];
+              << static_cast<float*>(outputs.front().data.data())[i];
-    PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
+    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
                   result[i]);
  }
 }

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
  }
 }
-void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
+void CreateTrtEngineOp(Node *node, Argument *argument,
                       framework::proto::BlockDesc *block) {
+  PADDLE_ENFORCE(argument->main_dfg.get());
+  const DataFlowGraph &graph = *(argument->main_dfg);
  static int counter{0};
  PADDLE_ENFORCE(node->IsFunctionBlock());
  framework::OpDesc desc;
@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
  PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
  // Set attrs
  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
+  SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
+  SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
  SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  *block_desc.Proto()->mutable_vars() =
      argument_->origin_program_desc->blocks(0).vars();
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
-  CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
+  CreateTrtEngineOp(node, argument_, block_desc.Proto());
  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
  auto *op = main_block->add_ops();
  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");

--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
 void SubGraphFuse::ReplaceNodesWithSubGraphs() {
  auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
  for (auto &subgraph : subgraphs) {
+    if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
+      continue;
    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
    // replace this sub-graph with the first node. Two steps: 1. Create a Block
    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph

--- a/paddle/fluid/inference/analysis/subgraph_splitter.h
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <vector>
+#include "paddle/fluid/inference/analysis/argument.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/node.h"
@@ -63,8 +64,11 @@ class SubGraphFuse {
 public:
  using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
-  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
+  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller,
-      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+               Argument *argument)
+      : graph_(graph),
+        node_inside_subgraph_teller_(teller),
+        argument_(argument) {}
  // The main method which run all the logic.
  void operator()();
@@ -76,6 +80,7 @@ class SubGraphFuse {
 private:
  DataFlowGraph *graph_;
  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+  Argument *argument_;
 };
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) {
 TEST(SubGraphSplitter, Fuse) {
  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
  auto dfg = ProgramDescToDFG(desc);
+  Argument argument;
+  argument.Set<int>("minimum_subgraph_size", new int(3));
  size_t count0 = dfg.nodes.size();
-  SubGraphFuse fuse(&dfg, teller);
+  SubGraphFuse fuse(&dfg, teller, &argument);
  fuse();
  int count1 = 0;

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
    : node_inside_subgraph_teller_(teller) {}
 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
-  SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
  VLOG(4) << "debug info "
          << graph->HumanReadableInfo(false /*show_values*/,
                                      true /*show_functions*/);

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
  explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
-  bool Initialize(Argument* argument) override { return true; }
+  bool Initialize(Argument* argument) override {
+    argument_ = argument;
+    return true;
+  }
  // This class get a sub-graph as input and determine whether to transform this
  // sub-graph into TensorRT.
@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
 private:
  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+  Argument* argument_;
 };
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) {
  };
  Argument argument(FLAGS_inference_model_dir);
+  argument.Set<int>("minimum_subgraph_size", new int(0));
+  argument.Set<int>("max_batch_size", new int(3));
+  argument.Set<int>("workspace_size", new int(1 << 20));
+  argument.Set<std::string>("precision_mode", new std::string("FP32"));
  DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
  DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};

--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -21,6 +21,12 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
+#ifdef __clang__
+#define ACC_DIFF 4e-3
+#else
+#define ACC_DIFF 1e-3
+#endif
 DEFINE_string(dirname, "", "Directory of the inference model.");
 namespace paddle {
@@ -99,8 +105,8 @@ void MainWord2Vec(bool use_gpu) {
  float* lod_data = output1.data<float>();
  for (int i = 0; i < output1.numel(); ++i) {
-    EXPECT_LT(lod_data[i] - data[i], 1e-3);
+    EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
-    EXPECT_GT(lod_data[i] - data[i], -1e-3);
+    EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
  }
 }
@@ -144,7 +150,7 @@ void MainImageClassification(bool use_gpu) {
  float* data = static_cast<float*>(outputs[0].data.data());
  float* lod_data = output1.data<float>();
  for (size_t j = 0; j < len / sizeof(float); ++j) {
-    EXPECT_NEAR(lod_data[j], data[j], 1e-3);
+    EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
  }
 }
@@ -199,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
      float* ref_data = refs[tid].data<float>();
      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
      for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
      }
    });
  }
@@ -251,7 +257,7 @@ void MainThreadsImageClassification(bool use_gpu) {
      float* ref_data = refs[tid].data<float>();
      EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
      for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
      }
    });
  }

--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
  bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
    FLAGS_IA_enable_tensorrt_subgraph_engine = true;
    VLOG(3) << "Predictor::init()";
-    FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
-    FLAGS_tensorrt_workspace_size = config_.workspace_size;
    if (config_.use_gpu) {
      place_ = paddle::platform::CUDAPlace(config_.device);
    } else {
@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
  void OptimizeInferenceProgram() {
    // Analyze inference_program
    Argument argument;
+    argument.Set<int>("minimum_subgraph_size",
+                      new int(config_.minimum_subgraph_size));
+    argument.Set<int>("max_batch_size", new int(config_.max_batch_size));
+    argument.Set<int>("workspace_size", new int(config_.workspace_size));
+    argument.Set<std::string>("precision_mode",
+                              new std::string(config_.precision_mode));
    if (!config_.model_dir.empty()) {
      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
    } else {

--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -194,6 +194,14 @@ struct MixedRTConfig : public NativeConfig {
  // For workspace_size, refer it from here:
  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
  int workspace_size{1 << 30};
+  //  We transform the Ops that can be converted into TRT layer in the model,
+  //  and aggregate these Ops into subgraphs for TRT execution.
+  //  We set this variable to control the minimum number of nodes in the
+  //  subgraph, 3 as default value.
+  int minimum_subgraph_size = 3;
+  // Reserved configuration
+  // We just support "FP32" now, "FP16" and "INT8" will be supported.
+  std::string precision_mode = "FP32";
 };
 // NOTE WIP, not stable yet.

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -85,3 +85,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
   endif()
 endif()
+if(WITH_GPU AND TENSORRT_FOUND)
+   set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
+   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
+   endif()
+   cc_test(test_trt_models SRCS trt_models_tester.cc  
+     ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models
+     DEPS paddle_inference_tensorrt_subgraph_engine)
+endif()
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+namespace paddle {
+using paddle::contrib::MixedRTConfig;
+DEFINE_string(dirname, "", "Directory of the inference model.");
+NativeConfig GetConfigNative() {
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname;
+  // LOG(INFO) << "dirname  " << config.model_dir;
+  config.fraction_of_gpu_memory = 0.45;
+  config.use_gpu = true;
+  config.device = 0;
+  return config;
+}
+MixedRTConfig GetConfigTRT() {
+  MixedRTConfig config;
+  config.model_dir = FLAGS_dirname;
+  config.use_gpu = true;
+  config.fraction_of_gpu_memory = 0.2;
+  config.device = 0;
+  config.max_batch_size = 3;
+  return config;
+}
+void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
+  NativeConfig config0 = GetConfigNative();
+  config0.model_dir = model_dirname;
+  MixedRTConfig config1 = GetConfigTRT();
+  config1.model_dir = model_dirname;
+  config1.max_batch_size = batch_size;
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
+      CreatePaddlePredictor<MixedRTConfig,
+                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
+  // Prepare inputs
+  int height = 224;
+  int width = 224;
+  float *data = new float[batch_size * 3 * height * width];
+  memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
+  data[0] = 1.0f;
+  // Prepare inputs
+  PaddleTensor tensor;
+  tensor.name = "input_0";
+  tensor.shape = std::vector<int>({batch_size, 3, height, width});
+  tensor.data = PaddleBuf(static_cast<void *>(data),
+                          sizeof(float) * (batch_size * 3 * height * width));
+  tensor.dtype = PaddleDType::FLOAT32;
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+  // Prepare outputs
+  std::vector<PaddleTensor> outputs0;
+  std::vector<PaddleTensor> outputs1;
+  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
+  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
+  // Get output.
+  ASSERT_EQ(outputs0.size(), 1UL);
+  ASSERT_EQ(outputs1.size(), 1UL);
+  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+  EXPECT_EQ(num_elements, num_elements1);
+  auto *data0 = static_cast<float *>(outputs0.front().data.data());
+  auto *data1 = static_cast<float *>(outputs1.front().data.data());
+  ASSERT_GT(num_elements, 0UL);
+  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+    EXPECT_NEAR(data0[i], data1[i], 1e-3);
+  }
+}
+TEST(trt_models_test, main) {
+  std::vector<std::string> infer_models = {"mobilenet", "resnet50",
+                                           "resnext50"};
+  for (auto &model_dir : infer_models) {
+    CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
+  }
+}
+}  // namespace paddle
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -301,6 +301,7 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)
    op_library(layer_norm_op DEPS cub)
+    op_library(reduce_mean_op DEPS cub)
 else()
    op_library(conv_op DEPS vol2col im2col)
 endif()
@@ -313,11 +314,6 @@ op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
 op_library(concat_op DEPS concat)
-# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
-add_subdirectory(concurrency)
-op_library(channel_send_op DEPS concurrency)
-op_library(channel_recv_op DEPS concurrency)
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})

--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel {
                      "Out and Label should have same height.");
    int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
+    int slide_steps = ctx->Attrs().Get<int>("slide_steps");
+    PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1");
+    PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number");
    ctx->SetOutputDim("AUC", {1});
-    ctx->SetOutputDim("BatchAUC", {1});
-    ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
+    slide_steps = slide_steps == 0 ? 1 : slide_steps;
-    ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
+    ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
+    ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
  }
 protected:
@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Label",
             "A 2D int tensor indicating the label of the training data. "
             "shape: [batch_size, 1]");
    // TODO(typhoonzero): support weight input
    AddInput("StatPos", "Statistic value when label = 1");
    AddInput("StatNeg", "Statistic value when label = 0");
@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("AUC",
              "A scalar representing the "
              "current area-under-the-curve.");
-    AddOutput("BatchAUC", "The AUC for current batch");
    AddOutput("StatPosOut", "Statistic value when label = 1");
    AddOutput("StatNegOut", "Statistic value when label = 0");
    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
        .SetDefault("ROC");
-    AddAttr<int>("num_thresholds",
+    AddAttr<int>(
-                 "The number of thresholds to use when discretizing the"
+        "num_thresholds",
-                 " roc curve.")
+        "The number of thresholds to use when discretizing the roc curve.")
        .SetDefault((2 << 12) - 1);
+    AddAttr<int>("slide_steps", "Use slide steps to calc batch auc.")
+        .SetDefault(1);
    AddComment(R"DOC(
 Area Under The Curve (AUC) Operator.

--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> {
    std::string curve = ctx.Attr<std::string>("curve");
    int num_thresholds = ctx.Attr<int>("num_thresholds");
+    // buckets contain numbers from 0 to num_thresholds
    int num_pred_buckets = num_thresholds + 1;
+    int slide_steps = ctx.Attr<int>("slide_steps");
    // Only use output var for now, make sure it's persistable and
    // not cleaned up for each batch.
@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> {
    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
-    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
+    auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
-    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
+    auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
-    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
-            auc);
-    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
+    std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
-    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
+    std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
-    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
-    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
+    auto stat_pos_calc = stat_pos_data.data();
-            num_thresholds, batch_auc);
+    auto stat_neg_calc = stat_neg_data.data();
+    statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
+            origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
+    calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
  }
 private:
@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> {
    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
  }
-  inline static void calcAuc(const framework::ExecutionContext &ctx,
+  inline static void statAuc(const framework::Tensor *label,
-                             const framework::Tensor *label,
                             const framework::Tensor *predict,
-                             int64_t *stat_pos, int64_t *stat_neg,
+                             const int num_pred_buckets,
-                             int num_thresholds,
+                             const int num_thresholds, const int slide_steps,
-                             framework::Tensor *auc_tensor) {
+                             int64_t *origin_stat_pos, int64_t *origin_stat_neg,
+                             int64_t **stat_pos, int64_t **stat_neg) {
    size_t batch_size = predict->dims()[0];
    size_t inference_width = predict->dims()[1];
    const T *inference_data = predict->data<T>();
    const auto *label_data = label->data<int64_t>();
-    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
    for (size_t i = 0; i < batch_size; i++) {
      uint32_t binIdx = static_cast<uint32_t>(
          inference_data[i * inference_width + 1] * num_thresholds);
      if (label_data[i]) {
-        stat_pos[binIdx] += 1.0;
+        (*stat_pos)[binIdx] += 1.0;
      } else {
-        stat_neg[binIdx] += 1.0;
+        (*stat_neg)[binIdx] += 1.0;
      }
    }
+    int bucket_length = num_pred_buckets * sizeof(int64_t);
+    // will stat auc unlimited.
+    if (slide_steps == 0) {
+      for (int slide = 0; slide < num_pred_buckets; ++slide) {
+        origin_stat_pos[slide] += (*stat_pos)[slide];
+        origin_stat_neg[slide] += (*stat_neg)[slide];
+      }
+      *stat_pos = origin_stat_pos;
+      *stat_neg = origin_stat_neg;
+    } else {
+      for (int slide = 1; slide < slide_steps; ++slide) {
+        int dst_idx = (slide - 1) * num_pred_buckets;
+        int src_inx = slide * num_pred_buckets;
+        std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx,
+                    bucket_length);
+        std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx,
+                    bucket_length);
+      }
+      std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets,
+                  *stat_pos, bucket_length);
+      std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets,
+                  *stat_neg, bucket_length);
+      std::memset(*stat_pos, 0, bucket_length);
+      std::memset(*stat_neg, 0, bucket_length);
+      for (int slide = 0; slide < num_pred_buckets; ++slide) {
+        int stat_pos_steps = 0;
+        int stat_neg_steps = 0;
+        for (int step = 0; step < slide_steps; ++step) {
+          stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets];
+          stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets];
+        }
+        (*stat_pos)[slide] += stat_pos_steps;
+        (*stat_neg)[slide] += stat_neg_steps;
+      }
+    }
+  }
+  inline static void calcAuc(const framework::ExecutionContext &ctx,
+                             int64_t *stat_pos, int64_t *stat_neg,
+                             int num_thresholds,
+                             framework::Tensor *auc_tensor) {
+    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
    *auc = 0.0f;
    double totPos = 0.0;
@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> {
      totPos += stat_pos[idx];
      totNeg += stat_neg[idx];
      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
      --idx;
    }

--- a/paddle/fluid/operators/channel_close_op.cc
+++ b/paddle/fluid/operators/channel_close_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/op_registry.h"
-namespace pf = paddle::framework;
-static constexpr char kChannel[] = "Channel";
-namespace paddle {
-namespace operators {
-class ChannelCloseOp : public framework::OperatorBase {
- public:
-  ChannelCloseOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto &inp = *scope.FindVar(Input(kChannel));
-    // Get the mutable version of the channel variable and closes it.
-    pf::ChannelHolder *ch = inp.GetMutable<framework::ChannelHolder>();
-    ch->close();
-  }
-};
-class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("Channel"),
-                   "The input of ChannelClose op must be set");
-  }
-};
-class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kChannel,
-             "The Channel Variable that should be closed by"
-             " the ChannelClose Op.");
-    AddComment(R"DOC(
-Channel Close Operator.
-This operator closes an open channel.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelCloseOpMaker);
--- a/paddle/fluid/operators/channel_create_op.cc
+++ b/paddle/fluid/operators/channel_create_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-namespace pf = paddle::framework;
-static constexpr char kOutput[] = "Out";
-namespace paddle {
-namespace operators {
-class ChannelCreateOp : public framework::OperatorBase {
- public:
-  ChannelCreateOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto &out = *scope.FindVar(Output(kOutput));
-    // Determine the datatype and capacity of the channel to be created
-    // from the attributes provided.
-    auto dtype =
-        static_cast<framework::proto::VarType::Type>(Attr<int>("data_type"));
-    auto capacity = Attr<int>("capacity");
-    // Based on the datatype, create a new channel holder initialized with
-    // the given capacity. When capacity is 0, an unbuffered channel is
-    // created.
-    pf::ChannelHolder *ch = out.GetMutable<framework::ChannelHolder>();
-    if (dtype == framework::proto::VarType::LOD_TENSOR) {
-      ch->Reset<pf::LoDTensor>(capacity);
-    } else if (dtype == framework::proto::VarType::SELECTED_ROWS) {
-      ch->Reset<pf::SelectedRows>(capacity);
-    } else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) {
-      ch->Reset<pf::LoDRankTable>(capacity);
-    } else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) {
-      ch->Reset<pf::LoDTensorArray>(capacity);
-    } else if (dtype == framework::proto::VarType::READER) {
-      ch->Reset<pf::ReaderHolder>(capacity);
-    } else if (dtype == framework::proto::VarType::CHANNEL) {
-      ch->Reset<pf::ChannelHolder>(capacity);
-    } else if (dtype == framework::proto::VarType::BOOL) {
-      ch->Reset<bool>(capacity);
-    } else if (dtype == framework::proto::VarType::INT32) {
-      ch->Reset<int>(capacity);
-    } else if (dtype == framework::proto::VarType::INT64) {
-      ch->Reset<int64_t>(capacity);
-    } else if (dtype == framework::proto::VarType::FP32) {
-      ch->Reset<float>(capacity);
-    } else if (dtype == framework::proto::VarType::FP64) {
-      ch->Reset<double>(capacity);
-    } else {
-      PADDLE_THROW(
-          "Data type %d is not in "
-          "[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, "
-          "READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]",
-          dtype);
-    }
-  }
-};
-class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasOutput(kOutput),
-                   "The output of ChannelCreate op must be set");
-    context->SetOutputDim(kOutput, {1});
-  }
-};
-class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput(kOutput,
-              "The object of a Channel type created by ChannelCreate Op.");
-    AddAttr<int>("capacity", "The size of the buffer of Channel.")
-        .SetDefault(0);
-    AddAttr<int>("data_type", "The data type of elements inside the Channel.");
-    AddComment(R"DOC(
-Channel Create Operator.
-This operator creates an object of the VarType Channel and returns it.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelCreateOpMaker);
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/framework/channel.h"
-#include <paddle/fluid/framework/lod_rank_table.h>
-#include <paddle/fluid/framework/lod_tensor_array.h>
-#include <paddle/fluid/framework/reader.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-static constexpr char Channel[] = "Channel";
-static constexpr char Status[] = "Status";
-static constexpr char Out[] = "Out";
-namespace paddle {
-namespace operators {
-void SetReceiveStatus(const platform::Place &dev_place,
-                      framework::Variable *status_var, bool status) {
-  auto cpu = platform::CPUPlace();
-  auto status_tensor =
-      status_var->GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
-                                                                         cpu);
-  status_tensor[0] = status;
-}
-class ChannelRecvOp : public framework::OperatorBase {
- public:
-  ChannelRecvOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-  void InferShape(framework::InferShapeContext *ctx) const {
-    PADDLE_ENFORCE(ctx->HasInput(Channel),
-                   "Input(Channel) of ChannelRecvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(Out),
-                   "Input(Channel) of ChannelRecvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(Status),
-                   "Output(Status) of ChannelRecvOp should not be null.");
-    ctx->SetOutputDim("Status", {1});
-  }
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    // Get the channel holder created by channel_create op, passed as input.
-    framework::ChannelHolder *ch =
-        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
-    auto output_var = scope.FindVar(Output(Out));
-    // Receive the data from the channel.
-    bool ok = concurrency::ChannelReceive(ch, output_var);
-    // Set the status output of the `ChannelReceive` call.
-    SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok);
-  }
-};
-class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(Channel,
-             "(Channel) A variable which \"receives\" the a value sent"
-             "to it by a channel_send op.")
-        .AsDuplicable();
-    AddOutput(Out,
-              "(Variable) Output Variable that will hold the data received"
-              " from the Channel")
-        .AsDuplicable();
-    AddOutput(Status,
-              "(Tensor) An LoD Tensor that returns a boolean status of the"
-              "result of the receive operation.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelRecvOpMaker);
--- a/paddle/fluid/operators/channel_send_op.cc
+++ b/paddle/fluid/operators/channel_send_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/framework/channel.h"
-#include <paddle/fluid/framework/lod_rank_table.h>
-#include <paddle/fluid/framework/lod_tensor_array.h>
-#include <paddle/fluid/framework/reader.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-static constexpr char Channel[] = "Channel";
-static constexpr char X[] = "X";
-namespace paddle {
-namespace operators {
-class ChannelSendOp : public framework::OperatorBase {
- public:
-  ChannelSendOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-  void InferShape(framework::InferShapeContext *ctx) const {
-    PADDLE_ENFORCE(ctx->HasInput(Channel),
-                   "Input(Channel) of ChannelSendOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(X),
-                   "Input(X) of ChannelSendOp should not be null.");
-  }
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    // Get the channel holder created by channel_create op, passed as input.
-    framework::ChannelHolder *ch =
-        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
-    auto input_var = scope.FindVar(Input(X));
-    // Send the input data through the channel.
-    concurrency::ChannelSend(ch, input_var);
-  }
-};
-class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(Channel,
-             "(Channel) A variable which \"sends\" the passed in value to "
-             "a listening receiver.")
-        .AsDuplicable();
-    AddInput(X, "(Variable) The value which gets sent by the channel.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelSendOpMaker);
--- a/paddle/fluid/operators/concurrency/CMakeLists.txt
+++ b/paddle/fluid/operators/concurrency/CMakeLists.txt
-cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3)
--- a/paddle/fluid/operators/concurrency/channel_util.cc
+++ b/paddle/fluid/operators/concurrency/channel_util.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include "paddle/fluid/framework/var_type.h"
-namespace poc = paddle::operators::concurrency;
-void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR)
-    ch->Send(var->GetMutable<framework::LoDTensor>());
-  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
-    ch->Send(var->GetMutable<framework::LoDRankTable>());
-  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
-    ch->Send(var->GetMutable<framework::LoDTensorArray>());
-  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
-    ch->Send(var->GetMutable<framework::SelectedRows>());
-  else if (type == framework::proto::VarType_Type_READER)
-    ch->Send(var->GetMutable<framework::ReaderHolder>());
-  else if (type == framework::proto::VarType_Type_CHANNEL)
-    ch->Send(var->GetMutable<framework::ChannelHolder>());
-  else
-    PADDLE_THROW("ChannelSend:Unsupported type");
-}
-bool poc::ChannelReceive(framework::ChannelHolder *ch,
-                         framework::Variable *var) {
-  // Get type of channel and use that to call mutable data for Variable
-  auto type = framework::ToVarType(ch->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR)
-    return ch->Receive(var->GetMutable<framework::LoDTensor>());
-  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
-    return ch->Receive(var->GetMutable<framework::LoDRankTable>());
-  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
-    return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
-  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
-    return ch->Receive(var->GetMutable<framework::SelectedRows>());
-  else if (type == framework::proto::VarType_Type_READER)
-    return ch->Receive(var->GetMutable<framework::ReaderHolder>());
-  else if (type == framework::proto::VarType_Type_CHANNEL)
-    return ch->Receive(var->GetMutable<framework::ChannelHolder>());
-  else
-    PADDLE_THROW("ChannelReceive:Unsupported type");
-}
-void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
-                            framework::Variable *var,
-                            std::shared_ptr<std::condition_variable_any> cond,
-                            std::function<bool(framework::ChannelAction)> cb) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensor>(), cond, cb);
-  } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDRankTable>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensorArray>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::SelectedRows>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_READER) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::ReaderHolder>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_CHANNEL) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::ChannelHolder>(), cond,
-                   cb);
-  } else {
-    PADDLE_THROW("ChannelAddToSendQ:Unsupported type");
-  }
-}
-void poc::ChannelAddToReceiveQ(
-    framework::ChannelHolder *ch, const void *referrer,
-    framework::Variable *var, std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(framework::ChannelAction)> cb) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensor>(), cond,
-                      cb);
-  } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDRankTable>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensorArray>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::SelectedRows>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_READER) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::ReaderHolder>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_CHANNEL) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::ChannelHolder>(),
-                      cond, cb);
-  } else {
-    PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type");
-  }
-}
--- a/paddle/fluid/operators/concurrency/channel_util.h
+++ b/paddle/fluid/operators/concurrency/channel_util.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/variable.h"
-namespace paddle {
-namespace operators {
-namespace concurrency {
-void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var);
-bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var);
-void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
-                       framework::Variable *var,
-                       std::shared_ptr<std::condition_variable_any> cond,
-                       std::function<bool(framework::ChannelAction)> cb);
-void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer,
-                          framework::Variable *var,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(framework::ChannelAction)> cb);
-}  // namespace concurrency
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
    auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+                  output);
  }
 };
@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
      input_grad->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, input_grad, static_cast<T>(0));
      depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                             paddings, input_grad);
+                             paddings, dilations, input_grad);
    }
    if (filter_grad) {
      filter_grad->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
      depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
    }
  }
 };

--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
        depthwiseConvInputGrad;
    depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
-                           output);
+                           dilations, output);
  }
 };
@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
    auto& dev_ctx = context.template device_context<DeviceContext>();
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
    if (input_grad) {
      math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
+      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
                    input_grad);
    }
@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
      math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
          depthwiseConvFilterGrad;
      depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
    }
  }
 };

--- a/paddle/fluid/operators/cub_reduce.h
+++ b/paddle/fluid/operators/cub_reduce.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <set>
+#include <vector>
+#include <cub/cub.cuh>  // NOLINT
+#include "paddle/fluid/framework/tensor.h"
+namespace paddle {
+namespace operators {
+namespace detail {
+template <typename T, size_t ElementCount>
+struct Array {
+ public:
+  HOSTDEVICE inline Array() {}
+  HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
+  HOSTDEVICE inline const T& operator[](size_t index) const {
+    return data_[index];
+  }
+  HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
+  template <typename VectorLikeType>
+  static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
+    PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
+    size_t n = static_cast<size_t>(vec.size());
+    Array<T, ElementCount> ret;
+    for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
+    return ret;
+  }
+ private:
+  T data_[ElementCount];
+};
+// reduce the last axis of 2d array
+template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
+          int BlockDim>
+__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
+                               TransformOp transformer, Ty init,
+                               int reduce_num) {
+  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
+  int idx_x = blockIdx.x * reduce_num;
+  int idx_y = threadIdx.x;
+  Ty reduce_var = init;
+  for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
+    reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
+  reduce_var =
+      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
+  if (threadIdx.x == 0) {
+    y[blockIdx.x] = reduce_var;
+  }
+}
+template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
+          int BlockDim, int Rank, int ReduceRank>
+__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
+                             TransformOp transformer, Ty init, int reduce_num,
+                             Array<int, Rank> x_strides,
+                             Array<int, ReduceRank> reduce_dim,
+                             Array<int, ReduceRank> reduce_strides,
+                             Array<int, Rank - ReduceRank> left_dim,
+                             Array<int, Rank - ReduceRank> left_strides) {
+  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
+  Array<int, Rank> sub_index;
+  int left_idx = blockIdx.x;
+  for (int i = 0; i < Rank - ReduceRank; ++i) {
+    sub_index[left_dim[i]] = left_idx / left_strides[i];
+    left_idx %= left_strides[i];
+  }
+  int reduce_idx = threadIdx.x;
+  for (int j = 0; j < ReduceRank; ++j) {
+    sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
+    reduce_idx %= reduce_strides[j];
+  }
+  int idx_x = 0;
+  for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
+  Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
+  for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
+    int reduce_idx = i;
+    for (int j = 0; j < ReduceRank; ++j) {
+      sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
+      reduce_idx %= reduce_strides[j];
+    }
+    int idx_x = 0;
+    for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
+    reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
+  }
+  reduce_var =
+      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
+  if (threadIdx.x == 0) {
+    y[blockIdx.x] = reduce_var;
+  }
+}
+static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
+  int n = static_cast<int>(dims.size());
+  if (n == 0) return std::vector<int>();
+  std::vector<int> strides(n);
+  strides.back() = 1;
+  for (int i = n - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * dims[i + 1];
+  }
+  return strides;
+}
+static inline std::vector<int> GetStrides(const std::vector<int>& dims,
+                                          const std::vector<int>& idx) {
+  int n = static_cast<int>(idx.size());
+  if (n == 0) return std::vector<int>();
+  std::vector<int> strides(n);
+  strides.back() = 1;
+  for (int i = n - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * dims[idx[i + 1]];
+  }
+  return strides;
+}
+constexpr int kMaxBlockDim = 512;
+static inline int GetDesiredBlockDim(int block_dim) {
+  return block_dim >= kMaxBlockDim
+             ? kMaxBlockDim
+             : (1 << static_cast<int>(std::log2(block_dim)));
+}
+template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
+          typename TransformOp>
+static void TensorReduceImpl(
+    const Tx* x_data, Ty* y_data, const platform::Place& place,
+    const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
+    int left_num, int reduce_num, const std::vector<int>& x_strides,
+    const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
+    const std::vector<int>& left_dim, const std::vector<int>& left_strides,
+    cudaStream_t stream) {
+#define CUB_RANK_CASE(i, ...)             \
+  case i: {                               \
+    constexpr auto kRank = i;             \
+    switch (reduce_rank) { __VA_ARGS__; } \
+  } break
+#define CUB_REDUCE_RANK_CASE(i, ...)                              \
+  case i: {                                                       \
+    constexpr auto kReduceRank = i;                               \
+    ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank,  \
+                 kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
+        x_data, y_data, reducer, transformer, init, reduce_num,   \
+        Array<int, kRank>::From(x_strides),                       \
+        Array<int, kReduceRank>::From(reduce_dim),                \
+        Array<int, kReduceRank>::From(reduce_strides),            \
+        Array<int, kRank - kReduceRank>::From(left_dim),          \
+        Array<int, kRank - kReduceRank>::From(left_strides));     \
+  } break
+  int rank = x_strides.size();
+  int reduce_rank = reduce_strides.size();
+  if (rank == reduce_rank) {
+    cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
+        x_data, transformer);
+    size_t temp_storage_bytes = 0;
+    cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
+                              reduce_num, reducer, init, stream);
+    framework::Tensor tmp;
+    auto* temp_storage = tmp.mutable_data<uint8_t>(
+        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
+        place);
+    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
+                              reduce_num, reducer, init, stream);
+    return;
+  }
+  if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
+    ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
+                   BlockDim><<<left_num, BlockDim, 0, stream>>>(
+        x_data, y_data, reducer, transformer, init, reduce_num);
+    return;
+  }
+  /*
+  if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
+    // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
+    // Currently, it is handled by code below, but inefficient
+    return;
+  }
+  */
+  switch (rank) {
+    CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
+    CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
+    CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3););
+    CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
+    CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5););
+    CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
+    CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
+    CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
+                  CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
+  }
+#undef CUB_REDUCE_RANK_CASE
+#undef CUB_RANK_CASE
+}
+}  // namespace detail
+template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
+void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
+                  std::vector<int> origin_reduce_dims, const Ty& init,
+                  const ReduceOp& reducer, const TransformOp& transformer,
+                  cudaStream_t stream) {
+  auto x_dim = framework::vectorize2int(x.dims());
+  std::vector<int> new_x_dim, new_reduce_dims;
+  int is_reduced = 0;
+  for (auto e : origin_reduce_dims) {
+    auto pos = e >= 0 ? e : e + x_dim.size();
+    is_reduced |= 1 << e;
+  }
+  for (int i = 0; i < x_dim.size(); i++) {
+    if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
+      new_x_dim.push_back(x_dim[i]);
+      if ((is_reduced >> i) & 1)
+        new_reduce_dims.push_back(new_x_dim.size() - 1);
+    } else {
+      new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
+    }
+  }
+  x_dim = new_x_dim;
+  origin_reduce_dims = new_reduce_dims;
+  int x_rank = static_cast<int>(x_dim.size());
+  std::set<int> left_set, reduce_set;
+  for (int i = 0; i < x_rank; ++i) left_set.insert(i);
+  for (auto e : origin_reduce_dims) {
+    left_set.erase(e);
+    reduce_set.insert(e);
+  }
+  std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
+  std::vector<int> left_dim(left_set.begin(), left_set.end());
+  std::vector<int> x_strides = detail::GetStrides(x_dim);
+  std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
+  std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
+  int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
+  int left_num = 1;
+  if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
+  std::vector<int> y_dim(left_dim.size());
+  for (int i = 0; i < left_dim.size(); ++i) {
+    y_dim[i] = x_dim[left_dim[i]];
+  }
+  auto x_data = x.data<Tx>();
+  auto y_data = y->mutable_data<Ty>(x.place());
+  if (reduce_num == 1) return;
+#define CUB_BLOCK_DIM_CASE(block_dim)                                    \
+  case block_dim: {                                                      \
+    constexpr auto kBlockDim = block_dim;                                \
+    detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>(  \
+        x_data, y_data, x.place(), reducer, transformer, init, left_num, \
+        reduce_num, x_strides, reduce_dim, reduce_strides, left_dim,     \
+        left_strides, stream);                                           \
+  } break
+  switch (detail::GetDesiredBlockDim(reduce_num)) {
+    CUB_BLOCK_DIM_CASE(512);
+    CUB_BLOCK_DIM_CASE(256);
+    CUB_BLOCK_DIM_CASE(128);
+    CUB_BLOCK_DIM_CASE(64);
+    CUB_BLOCK_DIM_CASE(32);
+    CUB_BLOCK_DIM_CASE(16);
+    CUB_BLOCK_DIM_CASE(8);
+    CUB_BLOCK_DIM_CASE(4);
+    CUB_BLOCK_DIM_CASE(2);
+  }
+#undef CUB_BLOCK_DIM_CASE
+}
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <time.h>
+#include <atomic>
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT

--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <time.h>
+#include <condition_variable>  // NOLINT
 #include <functional>
 #include <string>

--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -14,6 +14,7 @@
 #pragma once
+#include <atomic>
 #include <set>
 #include <string>
 #include <thread>  // NOLINT

--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -89,7 +89,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
        .SetDefault(false);
    AddComment(string::Sprintf(R"DOC(
-Limited Elementwise %s Operator
+Elementwise %s Operator
 The equation is:

--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -290,12 +290,13 @@ class FusionGRUKernel : public framework::OpKernel<T> {
  void BatchCompute(const framework::ExecutionContext& ctx) const {
    using DeviceContext = paddle::platform::CPUDeviceContext;
    auto* x = ctx.Input<LoDTensor>("X");
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
    if (x->lod()[0].size() == 2) {
+      xx->Resize({total_T, D3});
      SeqCompute(ctx);
      return;
    }
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
    INIT_VEC_FUNC
    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");

--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -424,11 +424,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  void BatchCompute(const framework::ExecutionContext& ctx) const {
    using DeviceContext = platform::CPUDeviceContext;
    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
    if (x->lod()[0].size() == 2) {
+      xx->Resize({x_dims[0], D4});
      SeqCompute(ctx);
      return;
    }
-    INIT_BASE_SIZES
    INIT_VEC_FUNC
    INIT_BASE_INPUT_DATAS

--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <algorithm>
 #include <vector>
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -20,149 +21,268 @@ namespace paddle {
 namespace operators {
 namespace math {
+template <typename T>
+__inline__ __device__ T warpReduceSum(T val) {
+#if CUDA_VERSION < 9000
+  for (int offset = 16; offset > 0; offset /= 2)
+    val += __shfl_down(val, offset);
+  return val;
+#else
+#define FULL_MASK 0xffffffff
+  for (int offset = 16; offset > 0; offset /= 2)
+    val += __shfl_down_sync(FULL_MASK, val, offset);
+  return val;
+#endif
+}
+__forceinline__ __device__ unsigned lane_id() {
+  unsigned ret;
+  asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
+  return ret;
+}
+__forceinline__ __device__ unsigned warp_id() {
+  unsigned ret;
+  asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
+  return ret;
+}
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NCHW format.
 template <typename T>
-__global__ void KernelDepthwiseConv(
+__device__ __inline__ void KernelDepthwiseConv(
-    const int nthreads, const T* const input_data, const T* const filter_data,
+    const T* const input_data, const T* const filter_data, const int batch_size,
-    const int batch_size, const int output_channels, const int output_height,
+    const int output_channels, const int output_height, const int output_width,
-    const int output_width, const int input_channels, const int input_height,
+    const int input_channels, const int input_height, const int input_width,
-    const int input_width, const int filter_multiplier, const int filter_height,
+    const int filter_multiplier, const int filter_height,
    const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, T* const output_data) {
+    const int padding_height, const int padding_width, const int dilate_height,
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+    const int dilate_width, T* const output_data) {
+  for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
-  if (index < nthreads) {
+    for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
-    const int batch = index / output_channels / output_height / output_width;
+      const int batch = blockIdx.y;
-    const int c_out = (index / output_height / output_width) % output_channels;
+      const int c_out = blockIdx.x;
-    const int h_out = (index / output_width) % output_height;
-    const int w_out = index % output_width;
+      const int c_in = c_out / filter_multiplier;
+      const T* weight = filter_data + c_out * filter_height * filter_width;
-    const int c_in = c_out / filter_multiplier;
+      T value = 0;
-    const T* weight = filter_data + c_out * filter_height * filter_width;
+      const int h_in_start = -padding_height + h_out * stride_height;
-    T value = 0;
+      const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_start = -padding_height + h_out * stride_height;
+      const int h_in_end = h_in_start + filter_height * dilate_height;
-    const int w_in_start = -padding_width + w_out * stride_width;
+      const int w_in_end = w_in_start + filter_width * dilate_width;
-    const int h_in_end = h_in_start + filter_height;
-    const int w_in_end = w_in_start + filter_width;
+      const int in_offset =
+          ((batch * input_channels + c_in) * input_height) * input_width;
-    const int in_offset =
-        ((batch * input_channels + c_in) * input_height) * input_width;
+      const int h_end = h_in_end < input_height ? h_in_end : input_height;
+      const int w_end = w_in_end < input_width ? w_in_end : input_width;
-    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+      const int h_start = h_in_start > 0 ? h_in_start : 0;
-    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+      const int w_start = w_in_start > 0 ? w_in_start : 0;
-    const int h_start = h_in_start > 0 ? h_in_start : 0;
+      int weight_offset = 0;
-    const int w_start = w_in_start > 0 ? w_in_start : 0;
+      for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) {
-    for (int h_in = h_start; h_in < h_end; h_in++) {
+        for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) {
-      for (int w_in = w_start; w_in < w_end; w_in++) {
+          if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
-        const int offset = in_offset + h_in * input_width + w_in;
+              w_in < w_end) {
-        value +=
+            const int offset = in_offset + h_in * input_width + w_in;
-            weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
+            value += weight[weight_offset] * input_data[offset];
-            input_data[offset];
+          }
+          weight_offset++;
+        }
      }
+      int index =
+          ((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
+          w_out;
+      output_data[index] = value;
    }
-    output_data[index] = value;
  }
 }
+template <typename T, int c_filter_multiplier, int c_stride>
+__global__ void KernelDepthwiseConvSp(
+    const T* const input_data, const T* const filter_data, const int batch_size,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const output_data) {
+  if (c_filter_multiplier == 0)
+    KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
+                           output_height, output_width, input_channels,
+                           input_height, input_width, filter_multiplier,
+                           filter_height, filter_width, stride_height,
+                           stride_width, padding_height, padding_width,
+                           dilate_height, dilate_width, output_data);
+  else
+    KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
+                           output_height, output_width, input_channels,
+                           input_height, input_width, c_filter_multiplier,
+                           filter_height, filter_height, c_stride, c_stride,
+                           padding_height, padding_width, dilate_height,
+                           dilate_width, output_data);
+}
 // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
 template <typename T>
-__global__ void KernelDepthwiseConvInputGrad(
+__device__ __inline__ void KernelDepthwiseConvInputGrad(
-    const int nthreads, const T* const output_grad_data,
+    const T* const output_grad_data, const T* const filter_data,
-    const T* const filter_data, const int batch_size, const int output_channels,
+    const int batch_size, const int output_channels, const int output_height,
-    const int output_height, const int output_width, const int input_channels,
+    const int output_width, const int input_channels, const int input_height,
-    const int input_height, const int input_width, const int filter_multiplier,
+    const int input_width, const int filter_multiplier, const int filter_height,
-    const int filter_height, const int filter_width, const int stride_height,
+    const int filter_width, const int stride_height, const int stride_width,
-    const int stride_width, const int padding_height, const int padding_width,
+    const int padding_height, const int padding_width, const int dilate_height,
-    T* const input_grad_data) {
+    const int dilate_width, T* const input_grad_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
-  if (index < nthreads) {
+    for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
-    const int batch = index / input_channels / input_height / input_width;
+      const int batch = blockIdx.y;
-    const int c_in = (index / input_height / input_width) % input_channels;
+      const int c_in = blockIdx.x;
-    const int h_in = (index / input_width) % input_height;
-    const int w_in = index % input_width;
+      const int c_out_start = c_in * filter_multiplier;
-    const int c_out_start = c_in * filter_multiplier;
+      int h_out_start =
+          h_in - (filter_height - 1) * dilate_height + padding_height;
-    int h_out_start =
-        (h_in - filter_height + padding_height + stride_height) / stride_height;
+      int h_out_end = h_in + padding_height;
-    h_out_start = 0 > h_out_start ? 0 : h_out_start;
+      int w_out_start =
-    int h_out_end = (h_in + padding_height) / stride_height;
+          w_in - (filter_width - 1) * dilate_width + padding_width;
-    h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
+      int w_out_end = w_in + padding_width;
-    int w_out_start =
-        (w_in - filter_width + padding_width + stride_width) / stride_width;
+      T value = 0;
-    w_out_start = 0 > w_out_start ? 0 : w_out_start;
+      for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
-    int w_out_end = (w_in + padding_width) / stride_width;
+           c_out++) {
-    w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;
+        int filter_offset = (c_out + 1) * filter_height * filter_width;
+        for (int h_out = h_out_start; h_out <= h_out_end;
-    T value = 0;
+             h_out += dilate_height) {
+          for (int w_out = w_out_start; w_out <= w_out_end;
-    for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
+               w_out += dilate_width) {
-         c_out++) {
+            filter_offset--;
-      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
+            int s_h_out = h_out / stride_height;
-        const int filter_h = h_in + padding_height - h_out * stride_height;
+            int s_w_out = w_out / stride_width;
-        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
+            if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
-          const int filter_w = w_in + padding_width - w_out * stride_width;
+                s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
-          const int filter_offset = c_out * filter_height * filter_width +
+                s_w_out < output_width) {
-                                    filter_h * filter_width + filter_w;
+              const int output_grad_offset =
-          const int output_grad_offset =
+                  ((batch * output_channels + c_out) * output_height +
-              ((batch * output_channels + c_out) * output_height + h_out) *
+                   s_h_out) *
-                  output_width +
+                      output_width +
-              w_out;
+                  s_w_out;
-          value +=
+              value += output_grad_data[output_grad_offset] *
-              output_grad_data[output_grad_offset] * filter_data[filter_offset];
+                       filter_data[filter_offset];
+            }
+          }
        }
      }
+      int index =
+          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
+          w_in;
+      input_grad_data[index] = value;
    }
-    input_grad_data[index] += value;
  }
 }
+template <typename T, int c_filter_multiplier, int c_stride>
+__global__ void KernelDepthwiseConvInputGradSp(
+    const T* const output_grad_data, const T* const filter_data,
+    const int batch_size, const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const input_grad_data) {
+  if (c_filter_multiplier == 0)
+    KernelDepthwiseConvInputGrad<T>(
+        output_grad_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        filter_multiplier, filter_height, filter_width, stride_height,
+        stride_width, padding_height, padding_width, dilate_height,
+        dilate_width, input_grad_data);
+  else
+    KernelDepthwiseConvInputGrad<T>(
+        output_grad_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
+        padding_height, padding_width, dilate_height, dilate_width,
+        input_grad_data);
+}
 // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T>
-__global__ void KernelDepthwiseConvFilterGrad(
+__device__ __inline__ void KernelDepthwiseConvFilterGrad(
-    const int nthreads, const T* const output_grad_data,
+    const T* output_grad_data, const T* input_data, const int num,
-    const T* const input_data, const int num, const int output_channels,
+    const int output_channels, const int output_height, const int output_width,
-    const int output_height, const int output_width, const int input_channels,
+    const int input_channels, const int input_height, const int input_width,
-    const int input_height, const int input_width, const int filter_multiplier,
+    const int filter_multiplier, const int filter_height,
-    const int filter_height, const int filter_width, const int stride_height,
+    const int filter_width, const int stride_height, const int stride_width,
-    const int stride_width, const int padding_height, const int padding_width,
+    const int padding_height, const int padding_width, const int dilate_height,
-    T* const filter_grad_data) {
+    const int dilate_width, T* filter_grad_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  T s = 0;
-  if (index < nthreads) {
-    const int w_out = index % output_width;
+  int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
-    const int h_out = (index / output_width) % output_height;
+  int lid = lane_id();
-    const int c_out = (index / output_width / output_height) % output_channels;
-    const int batch = (index / output_width / output_height / output_channels);
+  for (int image_w = threadIdx.x; image_w < output_width;
-    const int c_in = c_out / filter_multiplier;
+       image_w += blockDim.x) {
-    const int h_in_start = -padding_height + h_out * stride_height;
+    for (int bid = 0; bid < num; bid++) {
-    const int w_in_start = -padding_width + w_out * stride_width;
+      for (int image_h = threadIdx.y; image_h < output_height;
-    const int h_in_end =
+           image_h += blockDim.y) {
-        -padding_height + h_out * stride_height + filter_height;
+        int kernel_id = blockIdx.z;
-    const int w_in_end = -padding_width + w_out * stride_width + filter_width;
+        int kernel_h = blockIdx.y * dilate_height - padding_height;
-    const int in_offset =
+        int kernel_w = blockIdx.x * dilate_width - padding_width;
-        (batch * input_channels + c_in) * input_height * input_width;
+        int image_hk = image_h * stride_height + kernel_h;
-    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
+        int image_wk = image_w * stride_width + kernel_w;
-    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+        if (image_hk < 0 || image_hk >= input_height) continue;
-    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+        if (image_wk < 0 || image_wk >= input_width) continue;
-    const int h_start = h_in_start > 0 ? h_in_start : 0;
+#define gaid(N, C, H, W) \
-    const int w_start = w_in_start > 0 ? w_in_start : 0;
+  ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
-    for (int h_in = h_start; h_in < h_end; h_in++) {
+        s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
-      for (int w_in = w_start; w_in < w_end; w_in++) {
+             input_data[((bid * (gridDim.z / filter_multiplier) +
-        const int offset = in_offset + h_in * input_width + w_in;
+                          kernel_id / filter_multiplier) *
-        const T diff_temp = output_grad_data[index] * input_data[offset];
+                             input_height +
-        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
+                         image_hk) *
-                  (w_in - w_in_start);
+                            input_width +
-        paddle::platform::CudaAtomicAdd(addr, diff_temp);
+                        image_wk];
+#undef gaid
      }
    }
  }
+#if __CUDA_ARCH__ >= 530
+  s = warpReduceSum<T>(s);
+  if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
+#else
+  paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
+#endif
+}
+template <typename T, int c_filter_multiplier>
+__global__ void KernelDepthwiseConvFilterGradSp(
+    const T* output_grad_data, const T* input_data, const int num,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* filter_grad_data) {
+  if (c_filter_multiplier == 0)
+    KernelDepthwiseConvFilterGrad<T>(
+        output_grad_data, input_data, num, output_channels, output_height,
+        output_width, input_channels, input_height, input_width,
+        filter_multiplier, filter_height, filter_width, stride_height,
+        stride_width, padding_height, padding_width, dilate_height,
+        dilate_width, filter_grad_data);
+  else
+    KernelDepthwiseConvFilterGrad<T>(
+        output_grad_data, input_data, num, output_channels, output_height,
+        output_width, input_channels, input_height, input_width,
+        c_filter_multiplier, filter_height, filter_width, stride_height,
+        stride_width, padding_height, padding_width, dilate_height,
+        dilate_width, filter_grad_data);
 }
 /*
@@ -177,7 +297,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
                  const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output) {
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
+                  framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
@@ -191,22 +313,37 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];
+    const int dilate_height = dilations[0];
+    const int dilate_width = dilations[1];
    const T* input_data = input.data<T>();
    const T* filter_data = filter.data<T>();
    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int nthreads = batch_size * output_channels * output_height * output_width;
+    int thread = 512;
-    int blocks = (nthreads + 1024 - 1) / 1024;
+    int blocks = std::min(std::max(thread / output_width, 1), output_height);
-    dim3 threads(1024, 1);
+    dim3 threads(std::min(output_width, thread), blocks, 1);
-    dim3 grid(blocks, 1);
+    dim3 grid(output_channels, batch_size, 1);
+    int filter_multiplier = output_channels / input_channels;
-    KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
+#define check_case(c_filter_multiplier, c_stride)                            \
-        nthreads, input_data, filter_data, batch_size, output_channels,
+  if (c_filter_multiplier == 0 ||                                            \
-        output_height, output_width, input_channels, input_height, input_width,
+      filter_multiplier == c_filter_multiplier &&                            \
-        output_channels / input_channels, ksize_height, ksize_width,
+          stride_height == stride_width && stride_height == c_stride) {      \
-        stride_height, stride_width, padding_height, padding_width,
+    KernelDepthwiseConvSp<T, c_filter_multiplier,                            \
-        output_data);
+                          c_stride><<<grid, threads, 0, context.stream()>>>( \
+        input_data, filter_data, batch_size, output_channels, output_height, \
+        output_width, input_channels, input_height, input_width,             \
+        filter_multiplier, ksize_height, ksize_width, stride_height,         \
+        stride_width, padding_height, padding_width, dilate_height,          \
+        dilate_width, output_data);                                          \
+    return;                                                                  \
+  }
+    check_case(1, 1);
+    check_case(1, 2);
+    // NOTE(liangdun): 0,0 for other case
+    // add other case if needed, e.g. check_case(2^n,1)
+    check_case(0, 0);
+#undef check_case
  }
 };
@@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
@@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];
+    const int dilate_height = dilations[0];
+    const int dilate_width = dilations[1];
    const T* filter_data = filter.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-    int nthreads = batch_size * input_channels * input_height * input_width;
+    int thread = 512;
-    int blocks = (nthreads + 1024 - 1) / 1024;
+    int blocks = std::min(std::max(thread / input_width, 1), input_height);
-    dim3 threads(1024, 1);
+    dim3 threads(std::min(input_width, thread), blocks, 1);
-    dim3 grid(blocks, 1);
+    dim3 grid(input_channels, batch_size, 1);
+    int filter_multiplier = output_channels / input_channels;
-    KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, filter_data, batch_size, output_channels,
+#define check_case(c_filter_multiplier, c_stride)                       \
-        output_height, output_width, input_channels, input_height, input_width,
+  if (c_filter_multiplier == 0 ||                                       \
-        output_channels / input_channels, ksize_height, ksize_width,
+      filter_multiplier == c_filter_multiplier &&                       \
-        stride_height, stride_width, padding_height, padding_width,
+          stride_height == stride_width && stride_height == c_stride) { \
-        input_grad_data);
+    KernelDepthwiseConvInputGradSp<                                     \
+        T, c_filter_multiplier,                                         \
+        c_stride><<<grid, threads, 0, context.stream()>>>(              \
+        output_grad_data, filter_data, batch_size, output_channels,     \
+        output_height, output_width, input_channels, input_height,      \
+        input_width, filter_multiplier, ksize_height, ksize_width,      \
+        stride_height, stride_width, padding_height, padding_width,     \
+        dilate_height, dilate_width, input_grad_data);                  \
+    return;                                                             \
+  }
+    check_case(1, 1);
+    check_case(1, 2);
+    // NOTE(liangdun): 0,0 for other case
+    // add other case if needed, e.g. check_case(2^n,1)
+    check_case(0, 0);
+#undef check_case
  }
 };
@@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* filter_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
@@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];
+    const int dilate_height = dilations[0];
+    const int dilate_width = dilations[1];
    const T* input_data = input.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
-    int nthreads = batch_size * output_channels * output_height * output_width;
+    int block_size = 512;
+    int crop_output_height =
-    int blocks = (nthreads + 1024 - 1) / 1024;
+        std::min(std::max(block_size / output_width, 1), output_height);
-    dim3 threads(1024, 1);
+    dim3 grid(ksize_width, ksize_height, output_channels);
-    dim3 grid(blocks, 1);
+    dim3 threads(std::min(output_width, block_size), crop_output_height, 1);
+    int filter_multiplier = output_channels / input_channels;
-    KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, input_data, batch_size, output_channels,
+#define check_case(c_filter_multiplier)                                       \
-        output_height, output_width, input_channels, input_height, input_width,
+  if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
-        output_channels / input_channels, ksize_height, ksize_width,
+    KernelDepthwiseConvFilterGradSp<                                          \
-        stride_height, stride_width, padding_height, padding_width,
+        T, c_filter_multiplier><<<grid, threads, 0, context.stream()>>>(      \
-        filter_grad_data);
+        output_grad_data, input_data, batch_size, output_channels,            \
+        output_height, output_width, input_channels, input_height,            \
+        input_width, filter_multiplier, ksize_height, ksize_width,            \
+        stride_height, stride_width, padding_height, padding_width,           \
+        dilate_height, dilate_width, filter_grad_data);                       \
+    return;                                                                   \
+  }
+    check_case(1);
+    check_case(0);
+#undef check_case
  }
 };

--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
@@ -32,7 +32,8 @@ class DepthwiseConvFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output);
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations, framework::Tensor* output);
 };
 template <typename DeviceContext, typename T>
@@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* input_grad);
 };
@@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* filter_grad);
 };

--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -12,17 +12,64 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <vector>
+#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_mean_op.h"
-REGISTER_OP_CUDA_KERNEL(reduce_mean,
+namespace paddle {
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+namespace operators {
-                                          float, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+template <typename T>
-                                          double, ops::MeanFunctor>,
+struct DivideFunctor {
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
-                                          int, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-                                          int64_t, ops::MeanFunctor>);
+ private:
+  T n_inv;
+};
+template <typename T>
+class ReduceMeanKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+    std::vector<int> reduce_dims;
+    if (reduce_all) {
+      reduce_dims.resize(input->dims().size());
+      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
+    } else {
+      for (auto e : dims) {
+        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
+      }
+    }
+    int reduce_num = 1;
+    for (int i = 0; i < reduce_dims.size(); ++i) {
+      reduce_num *= input->dims()[reduce_dims[i]];
+    }
+    auto stream = context.cuda_device_context().stream();
+    TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
+        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+        DivideFunctor<T>(reduce_num), stream);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
+                        ops::ReduceMeanKernel<double>,
+                        ops::ReduceMeanKernel<int>,
+                        ops::ReduceMeanKernel<int64_t>);
 REGISTER_OP_CUDA_KERNEL(
    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::MeanGradFunctor>,

--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -12,17 +12,59 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_sum_op.h"
-REGISTER_OP_CUDA_KERNEL(reduce_sum,
+namespace paddle {
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+namespace operators {
-                                          float, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+template <typename T>
-                                          double, ops::SumFunctor>,
+struct IdentityFunctor {
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+  HOSTDEVICE explicit inline IdentityFunctor() {}
-                                          int, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+  HOSTDEVICE inline T operator()(const T& x) const { return x; }
-                                          int64_t, ops::SumFunctor>);
+};
+template <typename T>
+class ReduceSumKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+    std::vector<int> reduce_dims;
+    if (reduce_all) {
+      reduce_dims.resize(input->dims().size());
+      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
+    } else {
+      for (auto e : dims) {
+        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
+      }
+    }
+    int reduce_num = 1;
+    for (int i = 0; i < reduce_dims.size(); ++i) {
+      reduce_num *= input->dims()[reduce_dims[i]];
+    }
+    auto stream = context.cuda_device_context().stream();
+    TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
+        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+        IdentityFunctor<T>(), stream);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
+                        ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
+                        ops::ReduceSumKernel<int64_t>);
 REGISTER_OP_CUDA_KERNEL(
    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::SumGradFunctor>,

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference {
    auto out_var_name = op_desc.Output("Out").front();
    auto *out_var = block->FindVarRecursive(out_var_name);
-    out_var->SetType(in_var.GetType());
+    if (in_var_name != out_var_name) {
-    out_var->SetDataType(in_var.GetDataType());
+      out_var->SetType(in_var.GetType());
+      out_var->SetDataType(in_var.GetDataType());
+    }
  }
 };

--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <memory>
-#include <thread>  // NOLINT
-#include <vector>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include <boost/tokenizer.hpp>
-namespace paddle {
-namespace operators {
-static constexpr char kX[] = "X";
-static constexpr char kCaseToExecute[] = "case_to_execute";
-static constexpr char kOutputs[] = "Out";
-static constexpr char kCases[] = "cases";
-static constexpr char kCasesBlock[] = "sub_block";
-class SelectOp : public framework::OperatorBase {
- public:
-  SelectOp(const std::string &type, const framework::VariableNameMap &inputs,
-           const framework::VariableNameMap &outputs,
-           const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
- private:
-  enum class SelectOpCaseType {
-    DEFAULT = 0,
-    SEND = 1,
-    RECEIVE = 2,
-  };
-  struct SelectOpCase {
-    int caseIndex;
-    SelectOpCaseType caseType;
-    std::string channelName;
-    std::string varName;
-    SelectOpCase() {}
-    SelectOpCase(int caseIndex, SelectOpCaseType caseType,
-                 std::string channelName, std::string varName)
-        : caseIndex(caseIndex),
-          caseType(caseType),
-          channelName(channelName),
-          varName(varName) {}
-  };
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    std::vector<std::string> casesConfigs =
-        Attr<std::vector<std::string>>(kCases);
-    framework::BlockDesc *casesBlock =
-        Attr<framework::BlockDesc *>(kCasesBlock);
-    framework::Scope &casesBlockScope = scope.NewScope();
-    std::string caseToExecuteVarName = Input(kCaseToExecute);
-    framework::Variable *caseToExecuteVar =
-        casesBlockScope.FindVar(caseToExecuteVarName);
-    // Construct cases from "conditional_block_op"(s) in the casesBlock
-    std::vector<std::shared_ptr<SelectOpCase>> cases =
-        ParseAndShuffleCases(&casesConfigs);
-    // Get all unique channels involved in select
-    std::set<framework::ChannelHolder *> channelsSet;
-    for (auto c : cases) {
-      if (!c->channelName.empty()) {
-        auto channelVar = scope.FindVar(c->channelName);
-        framework::ChannelHolder *ch =
-            channelVar->GetMutable<framework::ChannelHolder>();
-        if (channelsSet.find(ch) == channelsSet.end()) {
-          channelsSet.insert(ch);
-        }
-      }
-    }
-    // Order all channels by their pointer address
-    std::vector<framework::ChannelHolder *> channels(channelsSet.begin(),
-                                                     channelsSet.end());
-    std::sort(channels.begin(), channels.end());
-    // Poll all cases
-    int32_t caseToExecute = pollCases(&scope, &cases, channels);
-    // At this point, the case to execute has already been determined,
-    // so we can proceed with executing the cases block
-    framework::LoDTensor *caseToExecuteTensor =
-        caseToExecuteVar->GetMutable<framework::LoDTensor>();
-    caseToExecuteTensor->data<int32_t>()[0] = caseToExecute;
-    // Execute the cases block, only one case will be executed since we set the
-    // case_to_execute value to the index of the case we want to execute
-    framework::Executor executor(dev_place);
-    framework::ProgramDesc *program = casesBlock->Program();
-    executor.Run(*program, &casesBlockScope, casesBlock->ID(),
-                 false /*create_local_scope*/);
-  }
-  /**
-   * Goes through all operators in the casesConfigs and processes
-   * "conditional_block" operators.  These operators are mapped to our
-   * SelectOpCase objects.  We randomize the case orders, and set the
-   * default case (if any exists) as the last case)
-   * @param casesBlock
-   * @return
-   */
-  std::vector<std::shared_ptr<SelectOpCase>> ParseAndShuffleCases(
-      std::vector<std::string> *casesConfigs) const {
-    std::vector<std::shared_ptr<SelectOpCase>> cases;
-    std::shared_ptr<SelectOpCase> defaultCase;
-    if (casesConfigs != nullptr) {
-      boost::char_delimiters_separator<char> sep(false, ",", "");
-      for (std::vector<std::string>::iterator itr = casesConfigs->begin();
-           itr < casesConfigs->end(); ++itr) {
-        std::string caseConfig = *itr;
-        boost::tokenizer<> tokens(caseConfig, sep);
-        boost::tokenizer<>::iterator tok_iter = tokens.begin();
-        PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case index");
-        std::string caseIndexString = *tok_iter;
-        int caseIndex = std::stoi(caseIndexString);
-        ++tok_iter;
-        PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case type");
-        std::string caseTypeString = *tok_iter;
-        SelectOpCaseType caseType = (SelectOpCaseType)std::stoi(caseTypeString);
-        std::string caseChannel;
-        std::string caseChannelVar;
-        ++tok_iter;
-        if (caseType != SelectOpCaseType::DEFAULT) {
-          PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case channel");
-          caseChannel = *tok_iter;
-          ++tok_iter;
-          PADDLE_ENFORCE(tok_iter != tokens.end(),
-                         "Cannot get case channel variable");
-          caseChannelVar = *tok_iter;
-        }
-        auto c = std::make_shared<SelectOpCase>(caseIndex, caseType,
-                                                caseChannel, caseChannelVar);
-        if (caseType == SelectOpCaseType::DEFAULT) {
-          PADDLE_ENFORCE(defaultCase == nullptr,
-                         "Select can only contain one default case.");
-          defaultCase = c;
-        } else {
-          cases.push_back(c);
-        }
-      }
-    }
-    // Randomly sort cases, with default case being last
-    std::random_shuffle(cases.begin(), cases.end());
-    if (defaultCase != nullptr) {
-      cases.push_back(defaultCase);
-    }
-    return cases;
-  }
-  /**
-   * This method will recursively poll the cases and determines if any case
-   * condition is true.
-   * If none of the cases conditions are true (and there is no default case),
-   * then block
-   * the thread.  The thread may be woken up by a channel operation, at which
-   * point we
-   * execute the case.
-   * @param scope
-   * @param cases
-   * @param channels
-   * @return
-   */
-  int32_t pollCases(const framework::Scope *scope,
-                    std::vector<std::shared_ptr<SelectOpCase>> *cases,
-                    std::vector<framework::ChannelHolder *> channels) const {
-    // Lock all involved channels
-    lockChannels(channels);
-    std::atomic<int> caseToExecute(-1);
-    std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
-    while (it != cases->end()) {
-      std::shared_ptr<SelectOpCase> c = *it;
-      auto chVar = scope->FindVar(c->channelName);
-      framework::ChannelHolder *ch =
-          chVar->GetMutable<framework::ChannelHolder>();
-      switch (c->caseType) {
-        case SelectOpCaseType::SEND:
-          PADDLE_ENFORCE(!ch->IsClosed(), "Cannot send to a closed channel");
-          if (ch->CanSend()) {
-            // We can send to channel directly, send the data to channel
-            // and execute case
-            auto chVar = scope->FindVar(c->varName);
-            concurrency::ChannelSend(ch, chVar);
-            caseToExecute = c->caseIndex;
-          }
-          break;
-        case SelectOpCaseType::RECEIVE:
-          if (ch->CanReceive()) {
-            // We can receive from channel directly, send the data to channel
-            // and execute case
-            auto chVar = scope->FindVar(c->varName);
-            concurrency::ChannelReceive(ch, chVar);
-            caseToExecute = c->caseIndex;
-          }
-          break;
-        case SelectOpCaseType::DEFAULT:
-          caseToExecute = c->caseIndex;
-          break;
-      }
-      if (caseToExecute != -1) {
-        // We found a case to execute, stop looking at other case statements
-        break;
-      }
-      ++it;
-    }
-    if (caseToExecute == -1) {
-      // None of the cases are eligible to execute, enqueue current thread
-      // into all the sending/receiving queue of each involved channel
-      std::atomic<bool> completed(false);
-      std::recursive_mutex mutex;
-      std::unique_lock<std::recursive_mutex> lock{mutex};
-      // std::condition_variable_any selectCond;
-      auto selectCond = std::make_shared<std::condition_variable_any>();
-      std::recursive_mutex callbackMutex;
-      pushThreadOnChannelQueues(scope, cases, selectCond, &caseToExecute,
-                                &completed, &callbackMutex);
-      // TODO(thuan): Atomically unlock all channels and sleep current thread
-      unlockChannels(channels);
-      selectCond->wait(lock, [&completed]() { return completed.load(); });
-      // Select has been woken up by case operation
-      lockChannels(channels);
-      removeThreadOnChannelQueues(scope, cases);
-      if (caseToExecute == -1) {
-        // Recursively poll cases, since we were woken up by a channel close
-        // TODO(thuan): Need to test if this is a valid case
-        unlockChannels(channels);
-        return pollCases(scope, cases, channels);
-      }
-    }
-    // At this point, caseToExecute != -1, and we can proceed with executing
-    // the case block
-    unlockChannels(channels);
-    return caseToExecute;
-  }
-  void lockChannels(std::vector<framework::ChannelHolder *> chs) const {
-    std::vector<framework::ChannelHolder *>::iterator it = chs.begin();
-    while (it != chs.end()) {
-      framework::ChannelHolder *ch = *it;
-      ch->Lock();
-      ++it;
-    }
-  }
-  void unlockChannels(std::vector<framework::ChannelHolder *> chs) const {
-    std::vector<framework::ChannelHolder *>::reverse_iterator it = chs.rbegin();
-    while (it != chs.rend()) {
-      framework::ChannelHolder *ch = *it;
-      ch->Unlock();
-      ++it;
-    }
-  }
-  void pushThreadOnChannelQueues(
-      const framework::Scope *scope,
-      std::vector<std::shared_ptr<SelectOpCase>> *cases,
-      std::shared_ptr<std::condition_variable_any> rCond,
-      std::atomic<int> *caseToExecute, std::atomic<bool> *completed,
-      std::recursive_mutex *callbackMutex) const {
-    std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
-    while (it != cases->end()) {
-      std::shared_ptr<SelectOpCase> c = *it;
-      auto chVar = scope->FindVar(c->channelName);
-      framework::ChannelHolder *ch =
-          chVar->GetMutable<framework::ChannelHolder>();
-      std::function<bool(framework::ChannelAction channelAction)> cb =
-          [&caseToExecute, &completed, &callbackMutex,
-           c](framework::ChannelAction channelAction) {
-            std::lock_guard<std::recursive_mutex> lock{*callbackMutex};
-            bool canProcess = false;
-            if (!(*completed)) {
-              // If the channel wasn't closed, we set the caseToExecute index
-              // as this current case
-              if (channelAction != framework::ChannelAction::CLOSE) {
-                *caseToExecute = c->caseIndex;
-              }
-              // This will allow our conditional variable to break out of wait
-              *completed = true;
-              canProcess = true;
-            }
-            return canProcess;
-          };
-      switch (c->caseType) {
-        case SelectOpCaseType::SEND: {
-          auto chOutputVar = scope->FindVar(c->varName);
-          concurrency::ChannelAddToSendQ(ch, this, chOutputVar, rCond, cb);
-          break;
-        }
-        case SelectOpCaseType::RECEIVE: {
-          auto chOutputVar = scope->FindVar(c->varName);
-          concurrency::ChannelAddToReceiveQ(ch, this, chOutputVar, rCond, cb);
-          break;
-        }
-        default:
-          break;
-      }
-      ++it;
-    }
-  }
-  void removeThreadOnChannelQueues(
-      const framework::Scope *scope,
-      std::vector<std::shared_ptr<SelectOpCase>> *cases) const {
-    std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
-    while (it != cases->end()) {
-      std::shared_ptr<SelectOpCase> c = *it;
-      auto chVar = scope->FindVar(c->channelName);
-      framework::ChannelHolder *ch =
-          chVar->GetMutable<framework::ChannelHolder>();
-      switch (c->caseType) {
-        case SelectOpCaseType::SEND: {
-          ch->RemoveFromSendQ(this);
-          break;
-        }
-        case SelectOpCaseType::RECEIVE: {
-          ch->RemoveFromReceiveQ(this);
-          break;
-        }
-        default:
-          break;
-      }
-      ++it;
-    }
-  }
-};
-class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kX,
-             "A set of variables, which are required by operators inside the "
-             "cases of Select Op")
-        .AsDuplicable();
-    AddInput(kCaseToExecute,
-             "(Int) The variable the sets the index of the case to execute, "
-             "after evaluating the channels being sent to and received from")
-        .AsDuplicable();
-    AddOutput(kOutputs,
-              "A set of variables, which will be assigned with values "
-              "generated by the operators inside the cases of Select Op.")
-        .AsDuplicable();
-    AddAttr<std::vector<std::string>>(kCases,
-                                      "(String vector) Serialized list of"
-                                      "all cases in the select op. Each"
-                                      "case is serialized as: "
-                                      "'<index>,<type>,<channel>,<value>'"
-                                      "where type is 0 for default, 1 for"
-                                      "send, and 2 for receive"
-                                      "No channel and values are needed for"
-                                      "default cases.");
-    AddAttr<framework::BlockDesc *>(kCasesBlock,
-                                    "The cases block inside select_op");
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-// TODO(thuan): Implement Gradient Operator for SELECT_OP
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OPERATOR(select, paddle::operators::SelectOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::SelectOpMaker);
--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
    }
    for (size_t i = 0; i < n; ++i) {
-      PADDLE_ENFORCE_LT(0, offset_data[i],
+      PADDLE_ENFORCE_LE(0, offset_data[i],
                        "The offset[%d] must greater than zero.", i);
      PADDLE_ENFORCE_LT(0, length_data[i],
                        "The length[%d] must greater than zero.", i);
-      PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
+      PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
                        lod[0][i + 1], "The target tensor's length overflow.");
    }

--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto in_vars = context.MultiInputVar("X");
-    int N = in_vars.size();
+    size_t in_num = in_vars.size();
    auto out_var = context.OutputVar("Out");
    bool in_place = out_var == in_vars[0];
@@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> {
      auto &place =
          *context.template device_context<DeviceContext>().eigen_device();
      // If in_place, just skip the first tensor
-      for (int i = in_place ? 1 : 0; i < N; i++) {
+      for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
        if (in_vars[i]->IsType<framework::LoDTensor>()) {
          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
          if (in_t.numel() == 0) {
@@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> {
      // Runtime InferShape
      size_t first_dim = 0;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        first_dim += sel_row.rows().size();
      }
      std::vector<int64_t> in_dim;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        if (sel_row.rows().size() > 0) {
          in_dim = framework::vectorize(sel_row.value().dims());
@@ -116,7 +116,8 @@ class SumKernel : public framework::OpKernel<T> {
      }
      if (in_dim.empty()) {
        VLOG(3) << "WARNING: all the inputs are empty";
-        in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
+        in_dim =
+            framework::vectorize(get_selected_row(in_num - 1).value().dims());
      } else {
        in_dim[0] = static_cast<int64_t>(first_dim);
      }
@@ -133,7 +134,7 @@ class SumKernel : public framework::OpKernel<T> {
      math::SelectedRowsAddTo<DeviceContext, T> functor;
      int64_t offset = 0;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        if (sel_row.rows().size() == 0) {
          continue;

--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -22,8 +22,6 @@
 namespace paddle {
 DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
-DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
-DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");
 namespace operators {
@@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Ys", "A list of outputs").AsDuplicable();
    AddAttr<std::string>("subgraph", "the subgraph.");
    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
+    AddAttr<int>("max_batch_size", "the maximum batch size.");
+    AddAttr<int>("workspace_size", "the workspace size.");
    AddComment("TensorRT engine operator.");
  }
 };

--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -28,8 +28,6 @@
 namespace paddle {
 DECLARE_int32(tensorrt_engine_batch_size);
-DECLARE_int32(tensorrt_max_batch_size);
-DECLARE_int32(tensorrt_workspace_size);
 namespace operators {
@@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+    int max_batch_size = context.Attr<int>("max_batch_size");
    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
      Prepare(context);
    }
    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
    auto input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
+    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
-                      FLAGS_tensorrt_max_batch_size);
    std::vector<std::string> output_maps =
        context.Attr<std::vector<std::string>>("output_name_mapping");
@@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    // Get the ProgramDesc and pass to convert.
    framework::proto::BlockDesc block_desc;
    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch = FLAGS_tensorrt_max_batch_size;
+    int max_batch_size = context.Attr<int>("max_batch_size");
-    auto max_workspace = FLAGS_tensorrt_workspace_size;
+    int workspace_size = context.Attr<int>("workspace_size");
    auto params = context.Attr<std::vector<std::string>>("parameters");
    std::unordered_set<std::string> parameters;
    for (const auto& param : params) {
@@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    // TODO(Superjomn) replace this with a different stream
    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
-        max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
        context.Attr<std::string>("engine_uniq_key"),
        boost::get<platform::CUDAPlace>(context.GetPlace()).device);

--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
 using inference::analysis::SetAttr;
 TEST(TensorRTEngineOp, manual) {
-  FLAGS_tensorrt_engine_batch_size = 2;
-  FLAGS_tensorrt_max_batch_size = 2;
  framework::ProgramDesc program;
  auto* block_ = program.Proto()->add_blocks();
  block_->set_idx(0);
@@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                    std::vector<std::string>({}));
@@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) {
 }
 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
-  FLAGS_tensorrt_engine_batch_size = batch_size;
-  FLAGS_tensorrt_max_batch_size = batch_size;
  framework::ProgramDesc program;
  framework::Scope scope;
  platform::CUDAPlace place;
@@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
-  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
  SetAttr<std::vector<std::string>>(
      engine_op_desc.Proto(), "parameters",
      std::vector<std::string>({"y0", "y1", "y2", "y3"}));

--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -214,7 +214,6 @@ void BindVarDsec(pybind11::module *m) {
      .def("set_shapes", &pd::VarDesc::SetShapes)
      .def("set_dtype", &pd::VarDesc::SetDataType)
      .def("set_dtypes", &pd::VarDesc::SetDataTypes)
-      .def("set_capacity", &pd::VarDesc::SetCapacity)
      .def("shape", &pd::VarDesc::GetShape,
           pybind11::return_value_policy::reference)
      .def("shapes", &pd::VarDesc::GetShapes,
@@ -251,7 +250,6 @@ void BindVarDsec(pybind11::module *m) {
      .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES)
      .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE)
      .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
-      .value("CHANNEL", pd::proto::VarType::CHANNEL)
      .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
      .value("READER", pd::proto::VarType::READER)
      .value("RAW", pd::proto::VarType::RAW);
@@ -285,12 +283,12 @@ void BindOpDesc(pybind11::module *m) {
      .def("set_output", &pd::OpDesc::SetOutput)
      .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
      .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
-      .def("rename_input", &pd::OpDesc::RenameInput)
+      .def("_rename_input", &pd::OpDesc::RenameInput)
-      .def("rename_output", &pd::OpDesc::RenameOutput)
+      .def("_rename_output", &pd::OpDesc::RenameOutput)
      .def("has_attr", &pd::OpDesc::HasAttr)
      .def("attr_type", &pd::OpDesc::GetAttrType)
      .def("attr_names", &pd::OpDesc::AttrNames)
-      .def("set_attr", &pd::OpDesc::SetAttr)
+      .def("_set_attr", &pd::OpDesc::SetAttr)
      .def("attr", &pd::OpDesc::GetAttr)
      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
      .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
@@ -300,8 +298,8 @@ void BindOpDesc(pybind11::module *m) {
             std::string ser(seriralized);
             self.SetAttr(name, ser);
           })
-      .def("block_attr_id", &pd::OpDesc::GetBlockAttrId)
+      .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId)
-      .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
+      .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
      .def("check_attrs", &pd::OpDesc::CheckAttrs)
      .def("infer_shape", &pd::OpDesc::InferShape)
      .def("infer_var_type", &pd::OpDesc::InferVarType)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -21,7 +21,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"

--- a/paddle/legacy/trainer/tests/CMakeLists.txt
+++ b/paddle/legacy/trainer/tests/CMakeLists.txt
@@ -16,7 +16,11 @@ endfunction()
 trainer_test(test_Compare)
 trainer_test(test_PyDataProviderWrapper)
 trainer_test(test_recurrent_machine_generation)
-trainer_test(test_Trainer)
+if(NOT APPLE)
+  trainer_test(test_Trainer)
+else()
+  message(WARNING "These tests has been disabled in OSX for random fail: \n test_Trainer") 
+endif()
 ############### test_TrainerOnePass ##########################
 if(WITH_PYTHON)

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -70,8 +70,8 @@ function cmake_gen() {
    PYTHON_FLAGS=""
    SYSTEM=`uname -s`
    if [ "$SYSTEM" == "Darwin" ]; then
+        echo "using python abi: $1"
        if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
-            echo "using python abi: $1"
            if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
@@ -82,7 +82,18 @@ function cmake_gen() {
            else
                exit 1
            fi
-        # TODO: qiyang add python3 part here 
+        elif [ "$1" == "cp35-cp35m" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
+                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+            else
+                exit 1
+            fi
        fi
    else 
        if [ "$1" != "" ]; then
@@ -384,10 +395,11 @@ EOF
        ctest --output-on-failure -j $1     
        # make install should also be test when unittest 
        make install -j 8
-        pip install /usr/local/opt/paddle/share/wheels/*.whl
+        pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
            paddle version
        fi
+        pip uninstall -y paddlepaddle
    fi
 }
@@ -586,7 +598,7 @@ EOF
 EOF
    if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} || true"
    else
        NCCL_DEPS=""
    fi
@@ -602,9 +614,8 @@ EOF
    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
    ADD python/dist/*.whl /
    # run paddle version to install python packages first
-    RUN apt-get update &&\
+    RUN apt-get update && ${NCCL_DEPS}
-        ${NCCL_DEPS}\
+    RUN apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
-        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
        pip install /*.whl; apt-get install -f -y && \
        apt-get clean -y && \
        rm -f /*.whl && \
@@ -735,7 +746,7 @@ function main() {
        cmake_gen ${PYTHON_ABI:-""}
        build
        run_test
-        assert_api_not_changed
+        assert_api_not_changed ${PYTHON_ABI:-""}
        ;;
      *)
        print_usage

--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None):
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
-            print("file md5", md5file(filename), md5sum)
+            sys.stderr.write("file %s  md5 %s" % (md5file(filename), md5sum))
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
-        print("Cache file %s not found, downloading %s" % (filename, url))
+        sys.stderr.write("Cache file %s not found, downloading %s" %
+                         (filename, url))
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')
@@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None):
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)
-                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                    sys.stderr.write("\r[%s%s]" % ('=' * done,
                                                   ' ' * (50 - done)))
                    sys.stdout.flush()
+    sys.stderr.write("\n")
+    sys.stdout.flush()
    return filename

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -38,8 +38,8 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
        op_desc = op_descs[i]
        if isinstance(op_desc, tuple):
            op_desc = op_desc[0]
-        op_desc.rename_input(old_name, new_name)
+        op_desc._rename_input(old_name, new_name)
-        op_desc.rename_output(old_name, new_name)
+        op_desc._rename_output(old_name, new_name)
 def _create_op_desc_(op_type, inputs, outputs, attrs):
@@ -70,7 +70,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
        if isinstance(val, framework.Block):
            op_desc.set_block_attr(name, val.desc)
        else:
-            op_desc.set_attr(name, val)
+            op_desc._set_attr(name, val)
    return op_desc
@@ -346,7 +346,7 @@ def _append_backward_ops_(block,
        grad_sub_block_list = []
        # If the op has its own sub-block, deal with the sub-block first
        if op.has_attr("sub_block"):
-            sub_block = program.block(op.block_attr_id("sub_block"))
+            sub_block = program.block(op._block_attr_id("sub_block"))
            grad_sub_block = program._create_block()
            grad_sub_block._set_forward_block_idx(sub_block.idx)
            cb = _callback_lookup_(op)
@@ -382,7 +382,7 @@ def _append_backward_ops_(block,
    for op_desc in grad_op_descs:
        new_op_desc = target_block.desc.append_op()
        new_op_desc.copy_from(op_desc)
-        new_op_desc.set_attr(op_role_attr_name, backward)
+        new_op_desc._set_attr(op_role_attr_name, backward)
        grad_to_var["__current_op_desc__"] = new_op_desc
        if callbacks is not None:
            assert (isinstance(callbacks, list))
@@ -408,7 +408,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
    for op_idx in range(start_op_idx, block.desc.op_size()):
        op_desc = block.desc.op(op_idx)
        if op_desc.has_attr("sub_block"):
-            sub_block = block.program.block(op_desc.block_attr_id("sub_block"))
+            sub_block = block.program.block(op_desc._block_attr_id("sub_block"))
            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
        new_vars = set()
        # create new gradient variables
@@ -438,12 +438,12 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
        op_desc = block.desc.op(op_idx)
        for name in op_desc.input_arg_names():
            if name in var_map:
-                op_desc.rename_input(name, var_map[name])
+                op_desc._rename_input(name, var_map[name])
        for name in op_desc.output_arg_names():
            if block.desc.find_var(name.encode("ascii")):
                new_name = unique_name.generate(name)
-                op_desc.rename_output(name, new_name)
+                op_desc._rename_output(name, new_name)
                var_map[name] = new_name
    for g, ng in six.iteritems(var_map):
@@ -542,9 +542,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
        if loss.op is None:
            raise ValueError("loss.op is None. Should not happend")
-    loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
+    loss.op._set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
-                     int(core.op_proto_and_checker_maker.OpRole.Forward) |
+                      int(core.op_proto_and_checker_maker.OpRole.Forward) |
-                     int(core.op_proto_and_checker_maker.OpRole.Loss))
+                      int(core.op_proto_and_checker_maker.OpRole.Loss))
    if callbacks is not None:
        isinstance(callbacks, list)
@@ -631,7 +631,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
        attr_val = [p.name, g.name]
        if g.op.has_attr(op_role_var_attr_name):
            attr_val.extend(g.op.attr(op_role_var_attr_name))
-        g.op.set_attr(op_role_var_attr_name, attr_val)
+        g.op._set_attr(op_role_var_attr_name, attr_val)
    return params_and_grads

--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr):
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
-        clip_op_desc.set_attr("min", self.min)
+        clip_op_desc._set_attr("min", self.min)
-        clip_op_desc.set_attr("max", self.max)
+        clip_op_desc._set_attr("max", self.max)
 def error_clip_callback(block, context):
@@ -271,7 +271,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                    "All parameters' 'clip_norm' of a same group should be the same"
                )
-        local_norm_var = layers.reduce_sum(input=layers.pow(x=grad, factor=2.0))
+        square = grad * grad
+        local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64')
        context[self.group_name].append(local_norm_var)
        self.context = context
@@ -281,6 +282,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
        if group_scale_name not in self.context:
            group_norm_var = layers.sums(input=self.context[self.group_name])
            group_norm_var = layers.sqrt(x=group_norm_var)
+            group_norm_var = layers.cast(group_norm_var, 'float32')
            clip_var = self.context[self.group_name + "_clip"]
            group_scale_var = layers.elementwise_div(
                x=clip_var,

--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-from .layers.control_flow import BlockGuard, equal
-from .framework import Operator
-from .layer_helper import LayerHelper, unique_name
-from .layers import fill_constant
-from . import core
-__all__ = [
-    'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select'
-]
-class Go(BlockGuard):
-    def __init__(self, name=None):
-        self.helper = LayerHelper("go", name=name)
-        super(Go, self).__init__(self.helper.main_program)
-    def __enter__(self):
-        super(Go, self).__enter__()
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        self._construct_go_op()
-        return super(Go, self).__exit__(exc_type, exc_val, exc_tb)
-    def _construct_go_op(self):
-        main_program = self.helper.main_program
-        go_block = main_program.current_block()
-        parent_block = main_program.block(main_program.current_block()
-                                          .parent_idx)
-        inner_outputs = set()
-        x_name_list = set()
-        for op in go_block.ops:
-            # Iterate over all operators, get all the inputs
-            # and add as input to the Go operator.
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in inner_outputs:
-                        x_name_list.add(in_var_name)
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    inner_outputs.add(out_var_name)
-        # Iterate over all operators , get all the outputs
-        # add to the output list of Go operator only if
-        # they exist in the parent block.
-        out_vars = []
-        for inner_out_name in inner_outputs:
-            if inner_out_name in parent_block.vars:
-                out_vars.append(parent_block.var(inner_out_name))
-        parent_block.append_op(
-            type='go',
-            inputs={
-                'X': [
-                    parent_block._var_recursive(x_name)
-                    for x_name in x_name_list
-                ]
-            },
-            outputs={},
-            attrs={'sub_block': go_block})
-class SelectCase(object):
-    DEFAULT = 0
-    SEND = 1
-    RECEIVE = 2
-    def __init__(self,
-                 select,
-                 case_idx,
-                 case_to_execute,
-                 channel_action_fn=None,
-                 channel=None,
-                 value=None,
-                 is_copy=False):
-        self.select = select
-        self.helper = LayerHelper('conditional_block')
-        self.main_program = self.helper.main_program
-        self.is_scalar_condition = True
-        self.case_to_execute = case_to_execute
-        self.idx = case_idx
-        # Since we aren't going to use the `channel_send` or `channel_recv`
-        # functions directly, we just need to capture the name.
-        self.action = (self.SEND
-                       if channel_action_fn.__name__ == ('channel_send') else
-                       self.RECEIVE) if channel_action_fn else self.DEFAULT
-        X = value
-        if self.action == self.SEND and is_copy:
-            # We create of copy of the data we want to send
-            copied_X = self.select.parent_block.create_var(
-                name=unique_name.generate(value.name + '_copy'),
-                type=value.type,
-                dtype=value.dtype,
-                shape=value.shape,
-                lod_level=value.lod_level,
-                capacity=value.capacity
-                if hasattr(value, 'capacity') else None, )
-            self.select.parent_block.append_op(
-                type="assign", inputs={"X": value}, outputs={"Out": copied_X})
-            X = copied_X
-        self.value = X
-        self.channel = channel
-    def __enter__(self):
-        self.block = self.main_program._create_block()
-    def construct_op(self):
-        main_program = self.helper.main_program
-        cases_block = main_program.current_block()
-        inner_outputs = set()
-        input_set = set()
-        params = set()
-        for op in self.block.ops:
-            # Iterate over all operators, get all the inputs
-            # and add as input to the SelectCase operator.
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in inner_outputs:
-                        input_set.add(in_var_name)
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    inner_outputs.add(out_var_name)
-        param_list = [
-            cases_block.var(each_name) for each_name in params
-            if each_name not in input_set
-        ]
-        # Iterate over all operators, get all the outputs
-        # add to the output list of SelectCase operator only if
-        # they exist in the parent block.
-        out_vars = []
-        for inner_out_name in inner_outputs:
-            if inner_out_name in cases_block.vars:
-                out_vars.append(cases_block.var(inner_out_name))
-        # First, create an op that will determine whether or not this is the
-        # conditional variable to execute.
-        should_execute_block = equal(
-            fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
-            self.case_to_execute)
-        step_scope = cases_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-        cases_block.append_op(
-            type='conditional_block',
-            inputs={'X': [should_execute_block],
-                    'Params': param_list},
-            outputs={'Out': out_vars,
-                     'Scope': [step_scope]},
-            attrs={
-                'sub_block': self.block,
-                'is_scalar_condition': self.is_scalar_condition
-            })
-        return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
-                                if self.channel else '', self.value.name
-                                if self.value else '')
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.main_program._rollback()
-        if exc_type is not None:
-            return False  # re-raise exception
-        return True
-class Select(BlockGuard):
-    def __init__(self, name=None):
-        self.helper = LayerHelper('select', name=name)
-        self.parent_block = self.helper.main_program.current_block()
-        self.cases = []
-        super(Select, self).__init__(self.helper.main_program)
-        self.case_to_execute = fill_constant(
-            shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
-    def __enter__(self):
-        super(Select, self).__enter__()
-        return self
-    def case(self, channel_action_fn, channel, value, is_copy=False):
-        """Create a new block for this condition.
-        """
-        select_case = SelectCase(self,
-                                 len(self.cases), self.case_to_execute,
-                                 channel_action_fn, channel, value, is_copy)
-        self.cases.append(select_case)
-        return select_case
-    def default(self):
-        """Create a default case block for this condition.
-        """
-        default_case = SelectCase(self, len(self.cases), self.case_to_execute)
-        self.cases.append(default_case)
-        return default_case
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        # Create a select op and another block to wrap its
-        # case blocks.
-        select_block = self.helper.main_program.current_block()
-        parent_block = self.helper.main_program.block(select_block.parent_idx)
-        # Construct each case op, inside the newly created select block.
-        serialized_cases = []
-        for case in self.cases:
-            serialized_cases.append(case.construct_op())
-        intermediate = set()
-        params = set()
-        for case_block in select_block.ops:
-            if case_block.attrs and 'sub_block' in case_block.attrs:
-                for each_op in case_block.attrs['sub_block'].ops:
-                    assert isinstance(each_op, Operator)
-                    for iname in each_op.input_names:
-                        for in_var_name in each_op.input(iname):
-                            if in_var_name not in intermediate:
-                                params.add(in_var_name)
-                    for oname in each_op.output_names:
-                        for out_var_name in each_op.output(oname):
-                            intermediate.add(out_var_name)
-        out_list = [
-            parent_block.var(var_name) for var_name in parent_block.vars
-            if var_name in intermediate
-        ]
-        X = [select_block._var_recursive(x_name) for x_name in params]
-        # Needs to be used by `equal` inside the cases block.
-        X.append(self.case_to_execute)
-        # Construct the select op.
-        parent_block.append_op(
-            type='select',
-            inputs={'X': X,
-                    'case_to_execute': self.case_to_execute},
-            attrs={'sub_block': select_block,
-                   'cases': serialized_cases},
-            outputs={'Out': out_list})
-        return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
-def make_channel(dtype, capacity=0):
-    """
-    Helps implementation of a concurrent program by creating a "channel" of
-    a defined data type. Channels allow for the passing of data in
-    concurrent scenarios - such as when using threads to divide computation.
-    Channels can be used to "send" and "receive" such data concurrently.
-    There are two kinds of channels: unbuffered and buffered. Unbuffered
-    channels have no capacity - and thus, block on send and only unblock only
-    once what they have sent has been received.
-    On the other hand, buffered channels are initialized with a capacity -
-    and do not block on sends.
-    Use this method in combination with `channel_send`, `channel_recv`,
-    `channel_close`, and `Go` to design a concurrent Paddle program.
-    Args:
-        dtype (ParamAttr|string): Data type of the data sent in the channel.
-        This data type should be the string name of a numpy data type.
-        capacity (ParamAttr|int): Size of the channel. Defaults to 0 for
-        to create an unbuffered channel.
-    Returns:
-        Variable: The channel variable that can be used to send an receive data
-                  of the defined dtype.
-    Examples:
-        .. code-block:: python
-          ch = fluid.make_channel(dtype='int32', capacity=10)
-          ...
-          # Code to execute in a Go block, which receives the channel data.
-          fluid.channel_send(ch, 100)
-          fluid.channel_close(ch)
-    """
-    helper = LayerHelper('channel_create', **locals())
-    main_program = helper.main_program
-    make_channel_block = main_program.current_block()
-    # Make a channel variable (using the channel data type) and make sure it
-    # persists into the global scope.
-    channel = helper.create_variable(
-        name=unique_name.generate('channel'),
-        type=core.VarDesc.VarType.CHANNEL,
-        persistable=True)
-    create_channel_op = make_channel_block.append_op(
-        type="channel_create",
-        outputs={"Out": channel},
-        attrs={"data_type": dtype,
-               "capacity": capacity})
-    return channel
-def channel_send(channel, value, is_copy=False):
-    """
-    Sends a value through a channel variable. Used by an unbuffered or buffered
-    channel to pass data from within or to a concurrent Go block, where
-    `channel_recv` to used to get the passed value.
-    Args:
-        channel (Variable|Channel): Channel variable created using
-        `make_channel`.
-        value (Variable): Value to send to channel
-        is_copy (bool): Copy data while channel send. If False, then data
-        is moved. The input cannot be used after move. (default False)
-    Returns:
-        Variable: The boolean status on whether or not the channel
-                  successfully sent the passed value.
-    Examples:
-        .. code-block:: python
-          ch = fluid.make_channel(dtype='int32', capacity=10)
-          ...
-          # Code to execute in a Go block, which receives the channel data.
-          fluid.channel_send(ch, 100)
-    """
-    helper = LayerHelper('channel_send', **locals())
-    main_program = helper.main_program
-    channel_send_block = main_program.current_block()
-    X = value
-    if is_copy:
-        copied_X = helper.create_variable(
-            name=unique_name.generate(value.name + '_copy'),
-            type=value.type,
-            dtype=value.dtype,
-            shape=value.shape,
-            lod_level=value.lod_level,
-            capacity=value.capacity if hasattr(value, 'capacity') else None)
-        assign_op = channel_send_block.append_op(
-            type="assign", inputs={"X": value}, outputs={"Out": copied_X})
-        X = copied_X
-    channel_send_block.append_op(
-        type="channel_send", inputs={
-            "Channel": channel,
-            "X": X,
-        })
-def channel_recv(channel, return_value):
-    """
-    Receives a value through a channel variable. Used by an unbuffered or
-    buffered channel within a concurrent Go block to get data from originally
-    sent using `channel_send`, or from outside such a block where
-    `channel_send` is used to send the value.
-    Args:
-        channel (Variable|Channel): Channel variable created using
-        `make_channel`.
-        return_value (Variable): Variable to set as a result of running channel_recv_op
-    Returns:
-        Variable: The received value from the channel.
-        Variable: The boolean status on whether or not the channel
-                  successfully received the passed value.
-    Examples:
-        .. code-block:: python
-          ch = fluid.make_channel(dtype='int32', capacity=10)
-          with fluid.Go():
-            returned_value, return_status = fluid.channel_recv(ch, 'int32')
-          # Code to send data through the channel.
-    """
-    helper = LayerHelper('channel_recv', **locals())
-    main_program = helper.main_program
-    channel_recv_block = main_program.current_block()
-    status = helper.create_variable(
-        name=unique_name.generate('status'),
-        type=core.VarDesc.VarType.LOD_TENSOR,
-        dtype=core.VarDesc.VarType.BOOL)
-    channel_recv_op = channel_recv_block.append_op(
-        type="channel_recv",
-        inputs={"Channel": channel},
-        outputs={"Out": return_value,
-                 "Status": status})
-    return return_value, status
-def channel_close(channel):
-    """
-    Closes a channel created using `make_channel`.
-    Args:
-        channel (Variable|Channel): Channel variable created using
-        `make_channel`.
-    Examples:
-        .. code-block:: python
-          ch = fluid.make_channel(dtype='int32', capacity=10)
-          ...
-          # Code to receive and send data through a channel
-          ...
-          fluid.channel_close(ch)
-    """
-    helper = LayerHelper('channel_close', **locals())
-    main_program = helper.main_program
-    channel_close_block = main_program.current_block()
-    channel_close_op = channel_close_block.append_op(
-        type="channel_close", inputs={"Channel": channel})
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -37,11 +37,9 @@ from . import unique_name
 __all__ = [
    'Program',
-    'Operator',
    'default_startup_program',
    'default_main_program',
    'program_guard',
-    'get_var',
    'name_scope',
 ]
@@ -539,8 +537,7 @@ class Operator(object):
        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
-        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
+        'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
-        'channel_recv', 'select', 'checkpoint_notify', 'gen_nccl_id'
    }
    def __init__(self,
@@ -654,11 +651,11 @@ class Operator(object):
                self._update_desc_attr(attr_name, attr_val)
        self.desc.check_attrs()
-        if self.has_kernel(type):
+        if self._has_kernel(type):
            self.desc.infer_var_type(self.block.desc)
            self.desc.infer_shape(self.block.desc)
-    def has_kernel(self, op_type):
+    def _has_kernel(self, op_type):
        return op_type not in self.OP_WITHOUT_KERNEL_SET
    def to_string(self, throw_on_error):
@@ -699,7 +696,7 @@ class Operator(object):
        """
        return self.desc.input(name)
-    def rename_input(self, old_name, new_name):
+    def _rename_input(self, old_name, new_name):
        """
        Rename the `old_name` to `new_name`.
@@ -710,9 +707,9 @@ class Operator(object):
        Returns:
            None
        """
-        self.desc.rename_input(old_name, new_name)
+        self.desc._rename_input(old_name, new_name)
-    def rename_output(self, old_name, new_name):
+    def _rename_output(self, old_name, new_name):
        """
        Rename the `old_name` to `new_name`.
@@ -723,7 +720,7 @@ class Operator(object):
        Returns:
            None
        """
-        self.desc.rename_output(old_name, new_name)
+        self.desc._rename_output(old_name, new_name)
    @property
    def input_names(self):
@@ -787,7 +784,7 @@ class Operator(object):
        """
        return self.desc.attr_type(name)
-    def set_attr(self, name, val):
+    def _set_attr(self, name, val):
        """
        Set the value of attribute by attribute's name.
@@ -820,7 +817,7 @@ class Operator(object):
                isinstance(val, core.ProgramDesc):
            self.desc.set_serialized_attr(name, val.serialize_to_string())
        else:
-            self.desc.set_attr(name, val)
+            self.desc._set_attr(name, val)
    @property
    def attr_names(self):
@@ -839,7 +836,7 @@ class Operator(object):
        """
        return self.desc.attr(name)
-    def block_attr_id(self, name):
+    def _block_attr_id(self, name):
        """
        Get the block attribute's id by name.
@@ -849,9 +846,9 @@ class Operator(object):
        Returns:
            int: the block index.
        """
-        return self.desc.block_attr_id(name)
+        return self.desc._block_attr_id(name)
-    def block_attr(self, name):
+    def _block_attr(self, name):
        """
        Get the block attribute  by name.
@@ -862,11 +859,11 @@ class Operator(object):
            block: the block attribute.
        """
-        id = self.block_attr_id(name)
+        id = self._block_attr_id(name)
        assert (id >= 0 and id < len(self.block.program.blocks))
        return self.block.program.blocks[id]
-    def blocks_attr(self, name):
+    def _blocks_attr(self, name):
        """
        Get the blocks attribute  by name.
@@ -877,13 +874,13 @@ class Operator(object):
            list: list of the blocks attribute.
        """
        attrs = []
-        for i in self.blocks_attr_ids(name):
+        for i in self._blocks_attr_ids(name):
            assert (i >= 0 and i < len(self.block.program.blocks))
            attrs.append(self.block.program.blocks[i])
        return attrs
-    def blocks_attr_ids(self, name):
+    def _blocks_attr_ids(self, name):
        """
        Get the blocks attribute's ids by name.
@@ -894,7 +891,7 @@ class Operator(object):
            list: list of the blocks ids.
        """
-        return self.desc.blocks_attr_ids(name)
+        return self.desc._blocks_attr_ids(name)
    def all_attrs(self):
        """
@@ -908,11 +905,11 @@ class Operator(object):
        for n in attr_names:
            attr_type = self.desc.attr_type(n)
            if attr_type == core.AttrType.BLOCK:
-                attr_map[n] = self.block_attr(n)
+                attr_map[n] = self._block_attr(n)
                continue
            if attr_type == core.AttrType.BLOCKS:
-                attr_map[n] = self.blocks_attr(n)
+                attr_map[n] = self._blocks_attr(n)
                continue
            attr_map[n] = self.attr(n)
@@ -1786,7 +1783,7 @@ class Program(object):
            for j in six.moves.range(block.op_size()):
                op = block.op(j)
                if op.has_attr('is_test'):
-                    op.set_attr('is_test', True)
+                    op._set_attr('is_test', True)
        res.blocks = [
            Block(res, i) for i in six.moves.range(res.desc.num_blocks())
        ]
@@ -2160,7 +2157,7 @@ def program_guard(main_program, startup_program=None):
        switch_startup_program(startup_program)
-def get_var(name, program=None):
+def _get_var(name, program=None):
    """
    Get a variable by name from the global block of a program.

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -21,7 +21,7 @@ from .. import core
 from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
 from ..initializer import force_init_on_cpu
-from .ops import logical_and, logical_not, logical_or
+from .nn import logical_and, logical_not, logical_or
 import numpy
 import warnings
 import six

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -42,19 +42,11 @@ __all__ = [
    'roi_perspective_transform',
    'generate_proposal_labels',
    'generate_proposals',
-]
-__auto__ = [
    'iou_similarity',
    'box_coder',
    'polygon_box_transform',
 ]
-__all__ += __auto__
-for _OP in set(__auto__):
-    globals()[_OP] = generate_layer_fn(_OP)
 def rpn_target_assign(bbox_pred,
                      cls_logits,
@@ -308,6 +300,101 @@ def detection_output(loc,
    return nmsed_outs
+@templatedoc()
+def iou_similarity(x, y, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        y(${y_type}): ${y_comment}
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    helper = LayerHelper("iou_similarity", **locals())
+    if name is None:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+    helper.append_op(
+        type="iou_similarity",
+        inputs={"X": x,
+                "Y": y},
+        attrs={},
+        outputs={"Out": out})
+    return out
+@templatedoc()
+def box_coder(prior_box,
+              prior_box_var,
+              target_box,
+              code_type="encode_center_size",
+              box_normalized=True,
+              name=None):
+    """
+    ${comment}
+    Args:
+        prior_box(${prior_box_type}): ${prior_box_comment}
+        prior_box_var(${prior_box_var_type}): ${prior_box_var_comment}
+        target_box(${target_box_type}): ${target_box_comment}
+        code_type(${code_type_type}): ${code_type_comment}
+        box_normalized(${box_normalized_type}): ${box_normalized_comment}
+    Returns:
+        output_box(${output_box_type}): ${output_box_comment}
+    """
+    helper = LayerHelper("box_coder", **locals())
+    if name is None:
+        output_box = helper.create_tmp_variable(dtype=prior_box.dtype)
+    else:
+        output_box = helper.create_variable(
+            name=name, dtype=prior_box.dtype, persistable=False)
+    helper.append_op(
+        type="box_coder",
+        inputs={
+            "PriorBox": prior_box,
+            "PriorBoxVar": prior_box_var,
+            "TargetBox": target_box
+        },
+        attrs={"code_type": code_type,
+               "box_normalized": box_normalized},
+        outputs={"OutputBox": output_box})
+    return output_box
+@templatedoc()
+def polygon_box_transform(input, name=None):
+    """
+    ${comment}
+    Args:
+        input(${input_type}): ${input_comment}
+    Returns:
+        output(${output_type}): ${output_comment}
+    """
+    helper = LayerHelper("polygon_box_transform", **locals())
+    if name is None:
+        output = helper.create_tmp_variable(dtype=input.dtype)
+    else:
+        output = helper.create_variable(
+            name=name, dtype=prior_box.input, persistable=False)
+    helper.append_op(
+        type="polygon_box_transform",
+        inputs={"Input": input},
+        attrs={},
+        outputs={"Output": output})
+    return output
 @templatedoc()
 def detection_map(detect_res,
                  label,

--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None):
    return acc_out
-def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
+def auc(input,
+        label,
+        curve='ROC',
+        num_thresholds=2**12 - 1,
+        topk=1,
+        slide_steps=1):
    """
    **Area Under the Curve (AUC) Layer**
@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
        num_thresholds(int): The number of thresholds to use when discretizing
                             the roc curve. Default 200.
        topk(int): only topk number of prediction output will be used for auc.
+        slide_steps: when calc batch auc, we can not only use step currently but the previous steps can be used. slide_steps=1 means use the current step, slide_steps=3 means use current step and the previous second steps, slide_steps=0 use all of the steps.
    Returns:
        Variable: A scalar representing the current AUC.
@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
    auc_out = helper.create_tmp_variable(dtype="float64")
    batch_auc_out = helper.create_tmp_variable(dtype="float64")
    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
+    # for batch auc
+    batch_stat_pos = helper.create_global_variable(
+        persistable=True,
+        dtype='int64',
+        shape=[slide_steps, num_thresholds + 1])
+    batch_stat_neg = helper.create_global_variable(
+        persistable=True,
+        dtype='int64',
+        shape=[slide_steps, num_thresholds + 1])
+    # for global auc
    stat_pos = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
    stat_neg = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
-    for var in [stat_pos, stat_neg]:
+    for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
        helper.set_variable_initializer(
            var, Constant(
                value=0.0, force_cpu=True))
+    # Batch AUC
+    helper.append_op(
+        type="auc",
+        inputs={
+            "Predict": [input],
+            "Label": [label],
+            "StatPos": [batch_stat_pos],
+            "StatNeg": [batch_stat_neg]
+        },
+        attrs={
+            "curve": curve,
+            "num_thresholds": num_thresholds,
+            "slide_steps": slide_steps
+        },
+        outputs={
+            "AUC": [batch_auc_out],
+            "StatPosOut": [batch_stat_pos],
+            "StatNegOut": [batch_stat_neg]
+        })
+    # Global AUC
    helper.append_op(
        type="auc",
        inputs={
@@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
            "StatPos": [stat_pos],
            "StatNeg": [stat_neg]
        },
-        attrs={"curve": curve,
+        attrs={
-               "num_thresholds": num_thresholds},
+            "curve": curve,
+            "num_thresholds": num_thresholds,
+            "slide_steps": 0
+        },
        outputs={
            "AUC": [auc_out],
-            "BatchAUC": [batch_auc_out],
            "StatPosOut": [stat_pos],
            "StatNegOut": [stat_neg]
        })
-    return auc_out, batch_auc_out, [stat_pos, stat_neg]
+    return auc_out, batch_auc_out, [
+        batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
+    ]
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -29,29 +29,127 @@ from .. import unique_name
 from functools import reduce
 __all__ = [
-    'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
+    'fc',
-    'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
+    'embedding',
-    'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', 'conv3d',
+    'dynamic_lstm',
-    'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'pool3d',
+    'dynamic_lstmp',
-    'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose',
+    'dynamic_gru',
-    'sequence_expand', 'sequence_expand_as', 'sequence_pad', 'lstm_unit',
+    'gru_unit',
-    'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
+    'linear_chain_crf',
-    'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
+    'crf_decoding',
-    'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
+    'cos_sim',
-    'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
+    'cross_entropy',
-    'hsigmoid', 'beam_search', 'row_conv', 'multiplex', 'layer_norm',
+    'square_error_cost',
-    'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
+    'chunk_eval',
-    'autoincreased_step_counter', 'reshape', 'squeeze', 'unsqueeze',
+    'sequence_conv',
-    'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', 'roi_pool',
+    'conv2d',
-    'dice_loss', 'image_resize', 'image_resize_short', 'resize_bilinear',
+    'conv3d',
-    'gather', 'scatter', 'sequence_scatter', 'random_crop', 'mean_iou', 'relu',
+    'sequence_pool',
-    'log', 'crop', 'rank_loss', 'elu', 'relu6', 'pow', 'stanh', 'hard_sigmoid',
+    'sequence_softmax',
-    'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu', 'flatten',
+    'softmax',
-    'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate',
+    'pool2d',
-    'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div',
+    'pool3d',
-    'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min',
+    'batch_norm',
-    'elementwise_pow', 'uniform_random_batch_size_like', 'gaussian_random',
+    'beam_search_decode',
-    'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape'
+    'conv2d_transpose',
+    'conv3d_transpose',
+    'sequence_expand',
+    'sequence_expand_as',
+    'sequence_pad',
+    'lstm_unit',
+    'reduce_sum',
+    'reduce_mean',
+    'reduce_max',
+    'reduce_min',
+    'reduce_prod',
+    'sequence_first_step',
+    'sequence_last_step',
+    'dropout',
+    'split',
+    'ctc_greedy_decoder',
+    'edit_distance',
+    'l2_normalize',
+    'matmul',
+    'topk',
+    'warpctc',
+    'sequence_reshape',
+    'transpose',
+    'im2sequence',
+    'nce',
+    'hsigmoid',
+    'beam_search',
+    'row_conv',
+    'multiplex',
+    'layer_norm',
+    'softmax_with_cross_entropy',
+    'smooth_l1',
+    'one_hot',
+    'autoincreased_step_counter',
+    'reshape',
+    'squeeze',
+    'unsqueeze',
+    'lod_reset',
+    'lrn',
+    'pad',
+    'pad_constant_like',
+    'label_smooth',
+    'roi_pool',
+    'dice_loss',
+    'image_resize',
+    'image_resize_short',
+    'resize_bilinear',
+    'gather',
+    'scatter',
+    'sequence_scatter',
+    'random_crop',
+    'mean_iou',
+    'relu',
+    'log',
+    'crop',
+    'rank_loss',
+    'elu',
+    'relu6',
+    'pow',
+    'stanh',
+    'hard_sigmoid',
+    'swish',
+    'prelu',
+    'brelu',
+    'leaky_relu',
+    'soft_relu',
+    'flatten',
+    'sequence_mask',
+    'stack',
+    'pad2d',
+    'unstack',
+    'sequence_enumerate',
+    'expand',
+    'sequence_concat',
+    'scale',
+    'elementwise_add',
+    'elementwise_div',
+    'elementwise_sub',
+    'elementwise_mul',
+    'elementwise_max',
+    'elementwise_min',
+    'elementwise_pow',
+    'uniform_random_batch_size_like',
+    'gaussian_random',
+    'sampling_id',
+    'gaussian_random_batch_size_like',
+    'sum',
+    'slice',
+    'shape',
+    'logical_and',
+    'logical_or',
+    'logical_xor',
+    'logical_not',
+    'clip',
+    'clip_by_norm',
+    'mean',
+    'mul',
+    'sigmoid_cross_entropy_with_logits',
+    'maxout',
 ]
@@ -60,7 +158,6 @@ def fc(input,
       num_flatten_dims=1,
       param_attr=None,
       bias_attr=None,
-       use_mkldnn=False,
       act=None,
       is_test=False,
       name=None):
@@ -112,8 +209,6 @@ def fc(input,
            If it is set to None, the bias is initialized zero. Default: None.
        act (str, default None): Activation to be applied to the output of this layer.
        is_test(bool): A flag indicating whether execution is in test phase.
-        use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
-            library is installed. Default: False
        name (str, default None): The name of this layer.
    Returns:
@@ -160,7 +255,7 @@ def fc(input,
            type="sum",
            inputs={"X": mul_results},
            outputs={"Out": pre_bias},
-            attrs={"use_mkldnn": use_mkldnn})
+            attrs={"use_mkldnn": False})
    # add bias
    pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
    # add activation
@@ -953,8 +1048,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
        soft_label (bool): a flag indicating whether to
                                           interpretate the given labels as soft
                                           labels. Default: `False`.
-        ignore_index (int): Specifies a target value that is ignored and does 
+        ignore_index (int): Specifies a target value that is ignored and does
-                            not contribute to the input gradient. Only valid 
+                            not contribute to the input gradient. Only valid
                            if soft_label is set to False. Default: -100
    Returns:
@@ -1324,7 +1419,6 @@ def conv2d(input,
           param_attr=None,
           bias_attr=None,
           use_cudnn=True,
-           use_mkldnn=False,
           act=None,
           name=None):
    """
@@ -1402,8 +1496,6 @@ def conv2d(input,
        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
-        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
-            with mkldnn library. Default: False
        act (str): Activation type. Default: None
        name (str|None): A name for this layer(optional). If set None, the layer
            will be named automatically.
@@ -1476,7 +1568,7 @@ def conv2d(input,
            'dilations': dilation,
            'groups': groups,
            'use_cudnn': use_cudnn,
-            'use_mkldnn': use_mkldnn
+            'use_mkldnn': False
        })
    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -1494,7 +1586,6 @@ def conv3d(input,
           param_attr=None,
           bias_attr=None,
           use_cudnn=True,
-           use_mkldnn=False,
           act=None,
           name=None):
    """
@@ -1568,7 +1659,6 @@ def conv3d(input,
        bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None
        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
-        use_mkldnn (bool): Use mkldnn kernels or not.
        act (str): Activation type. Default: None
        name (str|None): A name for this layer(optional). If set None, the layer
            will be named automatically.
@@ -1638,7 +1728,7 @@ def conv3d(input,
            'dilations': dilation,
            'groups': groups,
            'use_cudnn': use_cudnn,
-            'use_mkldnn': use_mkldnn
+            'use_mkldnn': False
        })
    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -1820,7 +1910,6 @@ def pool2d(input,
           global_pooling=False,
           use_cudnn=True,
           ceil_mode=False,
-           use_mkldnn=False,
           name=None):
    """
    ${comment}
@@ -1838,7 +1927,6 @@ def pool2d(input,
        global_pooling: ${global_pooling_comment}
        use_cudnn: ${use_cudnn_comment}
        ceil_mode: ${ceil_mode_comment}
-        use_mkldnn: ${use_mkldnn_comment}
        name (str|None): A name for this layer(optional). If set None, the
                        layer will be named automatically.
@@ -1898,7 +1986,7 @@ def pool2d(input,
            "paddings": pool_padding,
            "use_cudnn": use_cudnn,
            "ceil_mode": ceil_mode,
-            "use_mkldnn": use_mkldnn
+            "use_mkldnn": False
        })
    return pool_out
@@ -1912,7 +2000,6 @@ def pool3d(input,
           global_pooling=False,
           use_cudnn=True,
           ceil_mode=False,
-           use_mkldnn=False,
           name=None):
    """
    This function adds the operator for pooling in 3-dimensions, using the
@@ -1927,7 +2014,6 @@ def pool3d(input,
        global_pooling (bool): ${global_pooling_comment}
        use_cudnn (bool): ${use_cudnn_comment}
        ceil_mode (bool): ${ceil_mode_comment}
-        use_mkldnn (bool): ${use_mkldnn_comment}
        name (str): A name for this layer(optional). If set None, the layer
            will be named automatically.
@@ -1968,7 +2054,7 @@ def pool3d(input,
            "paddings": pool_padding,
            "use_cudnn": use_cudnn,
            "ceil_mode": ceil_mode,
-            "use_mkldnn": use_mkldnn
+            "use_mkldnn": False
        })
    return pool_out
@@ -1983,7 +2069,6 @@ def batch_norm(input,
               bias_attr=None,
               data_layout='NCHW',
               in_place=False,
-               use_mkldnn=False,
               name=None,
               moving_mean_name=None,
               moving_variance_name=None,
@@ -2025,7 +2110,6 @@ def batch_norm(input,
        bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
        data_layout(string, default NCHW): NCHW|NHWC
        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
-        use_mkldnn(bool, Default false): ${use_mkldnn_comment}
        name(string, Default None): A name for this layer(optional). If set None, the layer
            will be named automatically.
        moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
@@ -2117,7 +2201,7 @@ def batch_norm(input,
            "momentum": momentum,
            "epsilon": epsilon,
            "is_test": is_test,
-            "use_mkldnn": use_mkldnn,
+            "use_mkldnn": False,
            "fuse_with_relu": fuse_with_relu
        })
@@ -2714,20 +2798,20 @@ def sequence_pad(x, pad_value, maxlen=None):
    Args:
        x(Variable): Input variable which should contain lod information.
-        pad_value(Variable): The Variable that holds values that will be fill 
+        pad_value(Variable): The Variable that holds values that will be fill
-            into padded steps. It can be a scalar or a tensor whose shape 
+            into padded steps. It can be a scalar or a tensor whose shape
-            equals to time steps in sequences. If it's a scalar, it will be 
+            equals to time steps in sequences. If it's a scalar, it will be
            automatically broadcasted to the shape of time step.
-        maxlen(int, default None): The length of padded sequences. It can be 
+        maxlen(int, default None): The length of padded sequences. It can be
-            None or any positive int. When it is None, all sequences will be 
+            None or any positive int. When it is None, all sequences will be
-            padded up to the length of the longest one among them; when it a 
+            padded up to the length of the longest one among them; when it a
-            certain positive value, it must be greater than the length of the 
+            certain positive value, it must be greater than the length of the
            longest original sequence."
    Returns:
-        Variable: The padded sequence batch and the original lengths before 
+        Variable: The padded sequence batch and the original lengths before
                  padding. All sequences has the same length.
    Examples:
        .. code-block:: python
@@ -4343,8 +4427,8 @@ def softmax_with_cross_entropy(logits,
            soft_label is set to true, Label is a Tensor<float/double> with
        soft_label (bool): A flag to indicate whether to interpretate the given
            labels as soft labels. By default, `soft_label` is set to False.
-        ignore_index (int): Specifies a target value that is ignored and does 
+        ignore_index (int): Specifies a target value that is ignored and does
-                            not contribute to the input gradient. Only valid 
+                            not contribute to the input gradient. Only valid
                            if soft_label is set to False. Default: -100
    Returns:
@@ -4601,14 +4685,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
 def squeeze(input, axes, name=None):
    """
-    Remove single-dimensional entries from the shape of a tensor. Takes a 
+    Remove single-dimensional entries from the shape of a tensor. Takes a
-    parameter axes with a list of axes to squeeze. If axes is not provided, all 
+    parameter axes with a list of axes to squeeze. If axes is not provided, all
-    the single dimensions will be removed from the shape. If an axis is 
+    the single dimensions will be removed from the shape. If an axis is
    selected with shape entry not equal to one, an error is raised.
    Examples:
    Case 1:
-      Given 
+      Given
        X.shape = (1, 3, 1, 5)
      and
        axes = [0]
@@ -4617,11 +4701,11 @@ def squeeze(input, axes, name=None):
      Case 2:
        Given
          X.shape = (1, 3, 1, 5)
-        and 
+        and
          axes = []
        we get:
          Out.shape = (3, 5)
    Args:
        input (Variable): The input variable to be squeezed.
        axes (list): List of integers, indicating the dimensions to be squeezed.
@@ -4651,14 +4735,14 @@ def squeeze(input, axes, name=None):
 def unsqueeze(input, axes, name=None):
    """
-    Insert single-dimensional entries to the shape of a tensor. Takes one 
+    Insert single-dimensional entries to the shape of a tensor. Takes one
-    required argument axes, a list of dimensions that will be inserted. 
+    required argument axes, a list of dimensions that will be inserted.
-    Dimension indices in axes are as seen in the output tensor. 
+    Dimension indices in axes are as seen in the output tensor.
-    For example: 
+    For example:
-      Given a tensor such that tensor with shape [3, 4, 5], 
+      Given a tensor such that tensor with shape [3, 4, 5],
      then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
    Args:
        input (Variable): The input variable to be unsqueezed.
        axes (list): List of integers, indicating the dimensions to be inserted.
@@ -5757,39 +5841,39 @@ def pad2d(input,
    Example:
      Given that X is a channel of image from input:
      X = [[1, 2, 3],
           [4, 5, 6]]
      Case 0:
        paddings = [0, 1, 2, 3],
        mode = 'constant'
        pad_value = 0
        Out = [[0, 0, 1, 2, 3, 0, 0, 0]
               [0, 0, 4, 5, 6, 0, 0, 0]
               [0, 0, 0, 0, 0, 0, 0, 0]]
      Case 1:
        paddings = [0, 1, 2, 1],
        mode = 'reflect'
        Out = [[3, 2, 1, 2, 3, 2]
               [6, 5, 4, 5, 6, 5]
               [3, 2, 1, 2, 3, 2]]
      Case 2:
        paddings = [0, 1, 2, 1],
        mode = 'edge'
        Out = [[1, 1, 1, 2, 3, 3]
               [4, 4, 4, 5, 6, 6]
               [4, 4, 4, 5, 6, 6]]
    Args:
        input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
        paddings (tuple|list): The padding size. If padding is a tuple, it must
@@ -5988,7 +6072,7 @@ def prelu(x, mode, param_attr=None, name=None):
 		       channel:elements in a channel share same weight
 		       element:each element has a weight
 	name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically. 
+                        will be named automatically.
    Returns:
        Variable: The output tensor with the same shape as input.
@@ -6166,10 +6250,10 @@ def flatten(x, axis=1, name=None):
 def sequence_enumerate(input, win_size, pad_value=0, name=None):
    """
    Generate a new sequence for the input index sequence, which enumerates all the
-    sub-sequences with length `win_size` of the input. 
+    sub-sequences with length `win_size` of the input.
    The enumerated sequence has the same 1st dimension with variable `input`, and
    the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
    Examples:
    Case 1:
      Input:
@@ -6296,20 +6380,20 @@ def unstack(x, axis=0, num=None):
    **UnStack Layer**
    This layer unstacks input :code:`x` into several tensors along axis.
    If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`.
    If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`,
    and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is
-    raised. 
+    raised.
    Args:
-        x (Variable): Input variable. 
+        x (Variable): Input variable.
        axis (int): The axis along which the input is unstacked.
        num (int|None): The number of output variables.
    Returns:
        list(Variable): The unstacked variables.
    """
    helper = LayerHelper('unstack', **locals())
@@ -6342,21 +6426,21 @@ def expand(x, expand_times, name=None):
    .. code-block:: text
        Input(X) is a 3-D tensor with shape [2, 3, 1]:
                [
                   [[1], [2], [3]],
                   [[4], [5], [6]]
                ]
        Attr(expand_times):  [1, 2, 2]
        Output(Out) is a 3-D tensor with shape [2, 6, 2]:
                [
                    [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
                    [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
                ]
    Args:
        x (Variable): A tensor with rank in [1, 6].
        expand_times (list|tuple): Expand times number for each dimension.
@@ -6432,12 +6516,7 @@ def uniform_random_batch_size_like(input,
 @templatedoc()
-def gaussian_random(shape,
+def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
-                    mean=0.0,
-                    std=1.0,
-                    seed=0,
-                    dtype='float32',
-                    use_mkldnn=False):
    """
    ${comment}
@@ -6447,7 +6526,6 @@ def gaussian_random(shape,
        std (Float): ${std_comment}
        seed (Int): ${seed_comment}
        dtype(np.dtype|core.VarDesc.VarType|str): Output data type.
-        use_mkldnn (Bool): Only used in mkldnn kernel.
    Returns:
        out (Variable): ${out_comment}
@@ -6466,7 +6544,7 @@ def gaussian_random(shape,
            'std': std,
            'seed': seed,
            'dtype': c_dtype,
-            'use_mkldnn': use_mkldnn
+            'use_mkldnn': False
        })
    return out
@@ -6549,13 +6627,12 @@ def gaussian_random_batch_size_like(input,
 @templatedoc()
-def sum(x, use_mkldnn=False):
+def sum(x):
    """
    ${comment}
    Args:
        x (Variable): ${x_comment}
-        use_mkldnn (Bool): ${use_mkldnn_comment}
    Returns:
        out (Variable): ${out_comment}
@@ -6567,7 +6644,7 @@ def sum(x, use_mkldnn=False):
        type='sum',
        inputs={'X': x},
        outputs={'Out': out},
-        attrs={'use_mkldnn': use_mkldnn})
+        attrs={'use_mkldnn': False})
    return out
@@ -6630,14 +6707,12 @@ def _elementwise_op(helper):
    assert y is not None, 'y cannot be None in {}'.format(op_type)
    axis = helper.kwargs.get('axis', -1)
    use_mkldnn = helper.kwargs.get('use_mkldnn', False)
-    out = helper.kwargs.get('out', None)
+    name = helper.kwargs.get('name', None)
-    if out is None:
+    if name is None:
-        name = helper.kwargs.get('name', None)
+        out = helper.create_tmp_variable(dtype=x.dtype)
-        if name is None:
+    else:
-            out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable(
-        else:
+            name=name, dtype=x.dtype, persistable=False)
-            out = helper.create_variable(
-                name=name, dtype=x.dtype, persistable=False)
    helper.append_op(
        type=op_type,
@@ -6650,13 +6725,7 @@ def _elementwise_op(helper):
 @templatedoc()
-def scale(x,
+def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
-          scale=1.0,
-          bias=0.0,
-          bias_after_scale=True,
-          out=None,
-          act=None,
-          name=None):
    """
    ${comment}
@@ -6665,21 +6734,19 @@ def scale(x,
        scale(${scale_type}): ${scale_comment}
        bias(${bias_type}): ${bias_comment}
        bias_after_scale(${bias_after_scale_type}): ${bias_after_scale_comment}
-        out(Tensor): Output tensor.
        act(basestring|None): Activation applied to the output.
-        name(basestring|None): Name of the output. 
+        name(basestring|None): Name of the output.
    Returns:
        out(${out_type}): ${out_comment}
    """
    helper = LayerHelper('scale', **locals())
-    if out is None:
+    if name is None:
-        if name is None:
+        out = helper.create_tmp_variable(dtype=x.dtype)
-            out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
-        else:
+        out = helper.create_variable(
-            out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
-                name=name, dtype=x.dtype, persistable=False)
    helper.append_op(
        type='scale',
@@ -6693,73 +6760,31 @@ def scale(x,
    return helper.append_activation(out)
-def elementwise_add(x,
+def elementwise_add(x, y, axis=-1, act=None, name=None):
-                    y,
-                    out=None,
-                    axis=-1,
-                    use_mkldnn=False,
-                    act=None,
-                    name=None):
    return _elementwise_op(LayerHelper('elementwise_add', **locals()))
-def elementwise_div(x,
+def elementwise_div(x, y, axis=-1, act=None, name=None):
-                    y,
-                    out=None,
-                    axis=-1,
-                    use_mkldnn=False,
-                    act=None,
-                    name=None):
    return _elementwise_op(LayerHelper('elementwise_div', **locals()))
-def elementwise_sub(x,
+def elementwise_sub(x, y, axis=-1, act=None, name=None):
-                    y,
-                    out=None,
-                    axis=-1,
-                    use_mkldnn=False,
-                    act=None,
-                    name=None):
    return _elementwise_op(LayerHelper('elementwise_sub', **locals()))
-def elementwise_mul(x,
+def elementwise_mul(x, y, axis=-1, act=None, name=None):
-                    y,
-                    out=None,
-                    axis=-1,
-                    use_mkldnn=False,
-                    act=None,
-                    name=None):
    return _elementwise_op(LayerHelper('elementwise_mul', **locals()))
-def elementwise_max(x,
+def elementwise_max(x, y, axis=-1, act=None, name=None):
-                    y,
-                    out=None,
-                    axis=-1,
-                    use_mkldnn=False,
-                    act=None,
-                    name=None):
    return _elementwise_op(LayerHelper('elementwise_max', **locals()))
-def elementwise_min(x,
+def elementwise_min(x, y, axis=-1, act=None, name=None):
-                    y,
-                    out=None,
-                    axis=-1,
-                    use_mkldnn=False,
-                    act=None,
-                    name=None):
    return _elementwise_op(LayerHelper('elementwise_min', **locals()))
-def elementwise_pow(x,
+def elementwise_pow(x, y, axis=-1, act=None, name=None):
-                    y,
-                    out=None,
-                    axis=-1,
-                    use_mkldnn=False,
-                    act=None,
-                    name=None):
    return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
@@ -6771,7 +6796,291 @@ for func in [
    func.__doc__ = _generate_doc_string_(
        op_proto,
        additional_args_lines=[
-            "out (Tensor): The output tensor of elementwise op.",
            "act (basestring|None): Activation applied to the output.",
            "name (basestring|None): Name of the output."
        ])
+def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
+    helper = LayerHelper(op_name, **locals())
+    if binary_op:
+        assert x.dtype == y.dtype
+    if out is None:
+        if name is None:
+            out = helper.create_tmp_variable(dtype=x.dtype)
+        else:
+            out = helper.create_variable(
+                name=name, dtype=x.dtype, persistable=False)
+    if binary_op:
+        helper.append_op(
+            type=op_name, inputs={"X": x,
+                                  "Y": y}, outputs={"Out": out})
+    else:
+        helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out})
+    return out
+@templatedoc()
+def logical_and(x, y, out=None, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        y(${y_type}): ${y_comment}
+        out(Tensor): Output tensor of logical operation.
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    return _logical_op(
+        op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True)
+@templatedoc()
+def logical_or(x, y, out=None, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        y(${y_type}): ${y_comment}
+        out(Tensor): Output tensor of logical operation.
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    return _logical_op(
+        op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True)
+@templatedoc()
+def logical_xor(x, y, out=None, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        y(${y_type}): ${y_comment}
+        out(Tensor): Output tensor of logical operation.
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    return _logical_op(
+        op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True)
+@templatedoc()
+def logical_not(x, out=None, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        out(Tensor): Output tensor of logical operation.
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    return _logical_op(
+        op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False)
+@templatedoc()
+def clip(x, min, max, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        min(${min_type}): ${min_comment}
+        max(${max_type}): ${max_comment}
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    helper = LayerHelper("clip", **locals())
+    if name is None:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+    helper.append_op(
+        type="clip",
+        inputs={"X": x},
+        attrs={"min": min,
+               "max": max},
+        outputs={"Out": out})
+    return out
+@templatedoc()
+def clip_by_norm(x, max_norm, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        max_norm(${max_norm_type}): ${max_norm_comment}
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    helper = LayerHelper("clip_by_norm", **locals())
+    if name is None:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+    helper.append_op(
+        type="clip_by_norm",
+        inputs={"X": x},
+        attrs={"max_norm": max_norm},
+        outputs={"Out": out})
+    return out
+@templatedoc()
+def mean(x, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    helper = LayerHelper("mean", **locals())
+    if name is None:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+    helper.append_op(
+        type="mean", inputs={"X": x}, attrs={}, outputs={"Out": out})
+    return out
+@templatedoc()
+def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        y(${y_type}): ${y_comment}
+        x_num_col_dims(${x_num_col_dims_type}): ${x_num_col_dims_comment}
+        y_num_col_dims(${y_num_col_dims_type}): ${y_num_col_dims_comment}
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    helper = LayerHelper("mul", **locals())
+    if name is None:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+    helper.append_op(
+        type="mul",
+        inputs={"X": x,
+                "Y": y},
+        attrs={
+            "x_num_col_dims": x_num_col_dims,
+            "y_num_col_dims": y_num_col_dims
+        },
+        outputs={"Out": out})
+    return out
+@templatedoc()
+def sigmoid_cross_entropy_with_logits(x, label, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        label(${label_type}): ${label_comment}
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
+    if name is None:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+    helper.append_op(
+        type="sigmoid_cross_entropy_with_logits",
+        inputs={"X": x,
+                "Label": label},
+        attrs={},
+        outputs={"Out": out})
+    return out
+@templatedoc()
+def maxout(x, groups, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        groups(${groups_type}): ${groups_comment}
+        name(basestring|None): Name of the output.
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+    helper = LayerHelper("maxout", **locals())
+    if name is None:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+    helper.append_op(
+        type="maxout",
+        inputs={"X": x},
+        attrs={"groups": groups},
+        outputs={"Out": out})
+    return out
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -35,18 +35,7 @@ __activations_noattr__ = [
    'softsign',
 ]
-__all__ = [
+__all__ = []
-    'mean',
-    'mul',
-    'sigmoid_cross_entropy_with_logits',
-    'clip',
-    'clip_by_norm',
-    'logical_and',
-    'logical_or',
-    'logical_xor',
-    'logical_not',
-    'maxout',
-]
 for _OP in set(__all__):
    globals()[_OP] = generate_layer_fn(_OP)
@@ -56,6 +45,8 @@ for _OP in set(__all__):
 # e.g.: test_program_code.py, test_dist_train.py
 globals()['_scale'] = generate_layer_fn('scale')
+globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
 __all__ += __activations_noattr__
 for _OP in set(__activations_noattr__):

--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -40,8 +40,7 @@ def simple_img_conv_pool(input,
                         param_attr=None,
                         bias_attr=None,
                         act=None,
-                         use_cudnn=True,
+                         use_cudnn=True):
-                         use_mkldnn=False):
    """
    The simple_img_conv_pool is composed with one Convolution2d and one Pool2d.
@@ -84,8 +83,6 @@ def simple_img_conv_pool(input,
        act (str): Activation type for Conv2d. Default: None
        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
-        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
-            with mkldnn library. Default: False
    Return:
        Variable: The result of input after Convolution2d and Pool2d.
@@ -112,8 +109,7 @@ def simple_img_conv_pool(input,
        param_attr=param_attr,
        bias_attr=bias_attr,
        act=act,
-        use_cudnn=use_cudnn,
+        use_cudnn=use_cudnn)
-        use_mkldnn=use_mkldnn)
    pool_out = layers.pool2d(
        input=conv_out,
@@ -122,8 +118,7 @@ def simple_img_conv_pool(input,
        pool_stride=pool_stride,
        pool_padding=pool_padding,
        global_pooling=global_pooling,
-        use_cudnn=use_cudnn,
+        use_cudnn=use_cudnn)
-        use_mkldnn=use_mkldnn)
    return pool_out
@@ -138,8 +133,7 @@ def img_conv_group(input,
                   conv_batchnorm_drop_rate=0.0,
                   pool_stride=1,
                   pool_type="max",
-                   use_cudnn=True,
+                   use_cudnn=True):
-                   use_mkldnn=False):
    """
    The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut,
    and Pool2d. According to the input arguments, img_conv_group will do serials of
@@ -177,8 +171,6 @@ def img_conv_group(input,
            average-pooling. Default :math:`max`.
        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
-        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
-            with mkldnn library. Default: False
    Return:
        Variable: The final result after serial computation using Convolution2d,
@@ -226,8 +218,7 @@ def img_conv_group(input,
            padding=conv_padding[i],
            param_attr=param_attr[i],
            act=local_conv_act,
-            use_cudnn=use_cudnn,
+            use_cudnn=use_cudnn)
-            use_mkldnn=use_mkldnn)
        if conv_with_batchnorm[i]:
            tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
@@ -240,8 +231,7 @@ def img_conv_group(input,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
-        use_cudnn=use_cudnn,
+        use_cudnn=use_cudnn)
-        use_mkldnn=use_mkldnn)
    return pool_out

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -26,6 +26,7 @@ from .layer_helper import LayerHelper
 from .regularizer import append_regularization_ops
 from .clip import append_gradient_clip_ops, error_clip_callback
 from contextlib import contextmanager
+from .layers import ops
 __all__ = [
    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
@@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer):
            x=tmp, dtype='float32' if self._dtype == None else self._dtype)
        sum = layers.cast(
            x=sum, dtype='float32' if self._dtype == None else self._dtype)
-        layers.elementwise_div(x=sum, y=tmp, out=param)
+        ops._elementwise_div(x=sum, y=tmp, out=param)
    def _add_average_restore_op(self, block, param_grad):
        param = block._clone_variable(param_grad[0])

--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
@@ -2,6 +2,16 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 # default test
-foreach(src ${TEST_OPS})
+if(NOT APPLE)
-    py_test(${src} SRCS ${src}.py)
+    foreach(src ${TEST_OPS})
-endforeach()
+        py_test(${src} SRCS ${src}.py)
+    endforeach()
+else()
+    foreach(src ${TEST_OPS})
+        if(${src} STREQUAL "test_recognize_digits_conv")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        else()
+            py_test(${src} SRCS ${src}.py)
+        endif()
+    endforeach()
+endif()
--- a/python/paddle/fluid/tests/no_test_concurrency.py
+++ b/python/paddle/fluid/tests/no_test_concurrency.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid import framework, unique_name, layer_helper
-from paddle.fluid.executor import Executor
-from paddle.fluid.layers import fill_constant, assign, While, elementwise_add, Print
-class TestRoutineOp(unittest.TestCase):
-    def test_simple_routine(self):
-        ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-        # Create LOD_TENSOR<INT64> and put it into the scope.  This placeholder
-        # variable will be filled in and returned by fluid.channel_recv
-        result = self._create_tensor('return_value',
-                                     core.VarDesc.VarType.LOD_TENSOR,
-                                     core.VarDesc.VarType.INT64)
-        with fluid.Go():
-            input_value = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.FP64, value=1234)
-            fluid.channel_send(ch, input_value)
-        result, status = fluid.channel_recv(ch, result)
-        fluid.channel_close(ch)
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-        outs = exe.run(fetch_list=[result])
-        self.assertEqual(outs[0], 1234)
-    def test_daisy_chain(self):
-        '''
-        Mimics classic Daisy-chain test:  https://talks.golang.org/2012/concurrency.slide#39
-        '''
-        n = 100
-        leftmost = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-        left = leftmost
-        # TODO(thuan): Use fluid.While() after scope capture is implemented.
-        # https://github.com/PaddlePaddle/Paddle/issues/8502
-        for i in range(n):
-            right = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-            with fluid.Go():
-                one_tensor = self._create_one_dim_tensor(1)
-                result = self._create_tensor('return_value',
-                                             core.VarDesc.VarType.LOD_TENSOR,
-                                             core.VarDesc.VarType.INT64)
-                result, status = fluid.channel_recv(right, result)
-                one_added = fluid.layers.elementwise_add(x=one_tensor, y=result)
-                fluid.channel_send(left, one_added)
-            left = right
-        # Trigger the channel propagation by sending a "1" to rightmost channel
-        with fluid.Go():
-            one_tensor = self._create_one_dim_tensor(1)
-            fluid.channel_send(right, one_tensor)
-        leftmost_result = self._create_tensor('return_value',
-                                              core.VarDesc.VarType.LOD_TENSOR,
-                                              core.VarDesc.VarType.INT64)
-        leftmost_result, status = fluid.channel_recv(leftmost, leftmost_result)
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-        leftmost_data = exe.run(fetch_list=[leftmost_result])
-        # The leftmost_data should be equal to the number of channels + 1
-        self.assertEqual(leftmost_data[0][0], n + 1)
-    def _create_one_dim_tensor(self, value):
-        one_dim_tensor = fill_constant(shape=[1], dtype='int', value=value)
-        one_dim_tensor.stop_gradient = True
-        return one_dim_tensor
-    def _create_tensor(self, name, type, dtype):
-        return framework.default_main_program().current_block().create_var(
-            name=unique_name.generate(name), type=type, dtype=dtype)
-    def _create_persistable_tensor(self, name, type, dtype):
-        return framework.default_main_program().current_block().create_var(
-            name=unique_name.generate(name),
-            type=type,
-            dtype=dtype,
-            persistable=True)
-    def test_select(self):
-        with framework.program_guard(framework.Program()):
-            ch1 = fluid.make_channel(
-                dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
-            result1 = self._create_tensor('return_value',
-                                          core.VarDesc.VarType.LOD_TENSOR,
-                                          core.VarDesc.VarType.FP64)
-            input_value = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.FP64, value=10)
-            with fluid.Select() as select:
-                with select.case(fluid.channel_send, ch1, input_value):
-                    # Execute something.
-                    pass
-                with select.default():
-                    pass
-            # This should not block because we are using a buffered channel.
-            result1, status = fluid.channel_recv(ch1, result1)
-            fluid.channel_close(ch1)
-            cpu = core.CPUPlace()
-            exe = Executor(cpu)
-            result = exe.run(fetch_list=[result1])
-            self.assertEqual(result[0][0], 10)
-    def test_fibonacci(self):
-        """
-        Mimics Fibonacci Go example: https://tour.golang.org/concurrency/5
-        """
-        with framework.program_guard(framework.Program()):
-            quit_ch_input_var = self._create_persistable_tensor(
-                'quit_ch_input', core.VarDesc.VarType.LOD_TENSOR,
-                core.VarDesc.VarType.INT32)
-            quit_ch_input = fill_constant(
-                shape=[1],
-                dtype=core.VarDesc.VarType.INT32,
-                value=0,
-                out=quit_ch_input_var)
-            result = self._create_persistable_tensor(
-                'result', core.VarDesc.VarType.LOD_TENSOR,
-                core.VarDesc.VarType.INT32)
-            fill_constant(
-                shape=[1],
-                dtype=core.VarDesc.VarType.INT32,
-                value=0,
-                out=result)
-            x = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
-            y = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
-            while_cond = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
-            while_false = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
-            x_tmp = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
-            def fibonacci(channel, quit_channel):
-                while_op = While(cond=while_cond)
-                with while_op.block():
-                    result2 = fill_constant(
-                        shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
-                    with fluid.Select() as select:
-                        with select.case(
-                                fluid.channel_send, channel, x, is_copy=True):
-                            assign(input=x, output=x_tmp)
-                            assign(input=y, output=x)
-                            assign(elementwise_add(x=x_tmp, y=y), output=y)
-                        with select.case(fluid.channel_recv, quit_channel,
-                                         result2):
-                            # Quit
-                            helper = layer_helper.LayerHelper('assign')
-                            helper.append_op(
-                                type='assign',
-                                inputs={'X': [while_false]},
-                                outputs={'Out': [while_cond]})
-            ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-            quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-            with fluid.Go():
-                for i in range(10):
-                    fluid.channel_recv(ch1, result)
-                    Print(result)
-                fluid.channel_send(quit_ch, quit_ch_input)
-            fibonacci(ch1, quit_ch)
-            fluid.channel_close(ch1)
-            fluid.channel_close(quit_ch)
-            cpu = core.CPUPlace()
-            exe = Executor(cpu)
-            exe_result = exe.run(fetch_list=[result])
-            self.assertEqual(exe_result[0][0], 34)
-    def test_ping_pong(self):
-        """
-        Mimics Ping Pong example: https://gobyexample.com/channel-directions
-        """
-        with framework.program_guard(framework.Program()):
-            result = self._create_tensor('return_value',
-                                         core.VarDesc.VarType.LOD_TENSOR,
-                                         core.VarDesc.VarType.FP64)
-            ping_result = self._create_tensor('ping_return_value',
-                                              core.VarDesc.VarType.LOD_TENSOR,
-                                              core.VarDesc.VarType.FP64)
-            def ping(ch, message):
-                fluid.channel_send(ch, message, is_copy=True)
-            def pong(ch1, ch2):
-                fluid.channel_recv(ch1, ping_result)
-                fluid.channel_send(ch2, ping_result, is_copy=True)
-            pings = fluid.make_channel(
-                dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
-            pongs = fluid.make_channel(
-                dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
-            msg = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.FP64, value=9)
-            ping(pings, msg)
-            pong(pings, pongs)
-            fluid.channel_recv(pongs, result)
-            fluid.channel_close(pings)
-            fluid.channel_close(pongs)
-            cpu = core.CPUPlace()
-            exe = Executor(cpu)
-            exe_result = exe.run(fetch_list=[result])
-            self.assertEqual(exe_result[0][0], 9)
-if __name__ == '__main__':
-    unittest.main()
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -17,6 +17,9 @@ if(NOT WITH_DISTRIBUTE)
    list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
    LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290

--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+import dist_ctr_reader
+from test_dist_base import TestDistRunnerBase, runtime_main
+IS_SPARSE = True
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+class TestDistCTR2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
+        """ network definition """
+        dnn_data = fluid.layers.data(
+            name="dnn_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        lr_data = fluid.layers.data(
+            name="lr_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        label = fluid.layers.data(
+            name="click",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=0,
+            append_batch_size=False)
+        # build dnn model
+        dnn_layer_dims = [128, 64, 32, 1]
+        dnn_embedding = fluid.layers.embedding(
+            is_distributed=False,
+            input=dnn_data,
+            size=[dnn_input_dim, dnn_layer_dims[0]],
+            param_attr=fluid.ParamAttr(
+                name="deep_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=IS_SPARSE)
+        dnn_pool = fluid.layers.sequence_pool(
+            input=dnn_embedding, pool_type="sum")
+        dnn_out = dnn_pool
+        for i, dim in enumerate(dnn_layer_dims[1:]):
+            fc = fluid.layers.fc(
+                input=dnn_out,
+                size=dim,
+                act="relu",
+                param_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.Constant(value=0.01)),
+                name='dnn-fc-%d' % i)
+            dnn_out = fc
+        # build lr model
+        lr_embbding = fluid.layers.embedding(
+            is_distributed=False,
+            input=lr_data,
+            size=[lr_input_dim, 1],
+            param_attr=fluid.ParamAttr(
+                name="wide_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=IS_SPARSE)
+        lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
+        merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
+        predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
+        acc = fluid.layers.accuracy(input=predict, label=label)
+        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
+                                                              label=label)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        inference_program = paddle.fluid.default_main_program().clone()
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+        sgd_optimizer.minimize(avg_cost)
+        dataset = dist_ctr_reader.Dataset()
+        train_reader = paddle.batch(dataset.train(), batch_size=batch_size)
+        test_reader = paddle.batch(dataset.test(), batch_size=batch_size)
+        return inference_program, avg_cost, train_reader, test_reader, None, predict
+if __name__ == "__main__":
+    runtime_main(TestDistCTR2x2)
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import paddle
+import tarfile
+logging.basicConfig()
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz"
+DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
+"""
+avazu_ctr_data/train.txt
+avazu_ctr_data/infer.txt
+avazu_ctr_data/test.txt
+avazu_ctr_data/data.meta.txt
+"""
+def read_data(file_name):
+    path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5)
+    tar = tarfile.open(path, "r:gz")
+    tar_info = None
+    for member in tar.getmembers():
+        if member.name.endswith(file_name):
+            tar_info = member
+    f = tar.extractfile(tar_info)
+    ret_lines = [_.decode('utf-8') for _ in f.readlines()]
+    return ret_lines
+class TaskMode:
+    TRAIN_MODE = 0
+    TEST_MODE = 1
+    INFER_MODE = 2
+    def __init__(self, mode):
+        self.mode = mode
+    def is_train(self):
+        return self.mode == self.TRAIN_MODE
+    def is_test(self):
+        return self.mode == self.TEST_MODE
+    def is_infer(self):
+        return self.mode == self.INFER_MODE
+    @staticmethod
+    def create_train():
+        return TaskMode(TaskMode.TRAIN_MODE)
+    @staticmethod
+    def create_test():
+        return TaskMode(TaskMode.TEST_MODE)
+    @staticmethod
+    def create_infer():
+        return TaskMode(TaskMode.INFER_MODE)
+class ModelType:
+    CLASSIFICATION = 0
+    REGRESSION = 1
+    def __init__(self, mode):
+        self.mode = mode
+    def is_classification(self):
+        return self.mode == self.CLASSIFICATION
+    def is_regression(self):
+        return self.mode == self.REGRESSION
+    @staticmethod
+    def create_classification():
+        return ModelType(ModelType.CLASSIFICATION)
+    @staticmethod
+    def create_regression():
+        return ModelType(ModelType.REGRESSION)
+def load_dnn_input_record(sent):
+    return list(map(int, sent.split()))
+def load_lr_input_record(sent):
+    res = []
+    for _ in [x.split(':') for x in sent.split()]:
+        res.append(int(_[0]))
+    return res
+feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2}
+class Dataset(object):
+    def train(self):
+        '''
+        Load trainset.
+        '''
+        file_name = "train.txt"
+        logger.info("load trainset from %s" % file_name)
+        mode = TaskMode.create_train()
+        return self._parse_creator(file_name, mode)
+    def test(self):
+        '''
+        Load testset.
+        '''
+        file_name = "test.txt"
+        logger.info("load testset from %s" % file_name)
+        mode = TaskMode.create_test()
+        return self._parse_creator(file_name, mode)
+    def infer(self):
+        '''
+        Load infer set.
+        '''
+        file_name = "infer.txt"
+        logger.info("load inferset from %s" % file_name)
+        mode = TaskMode.create_infer()
+        return self._parse_creator(file_name, mode)
+    def _parse_creator(self, file_name, mode):
+        '''
+        Parse dataset.
+        '''
+        def _parse():
+            data = read_data(file_name)
+            for line_id, line in enumerate(data):
+                fs = line.strip().split('\t')
+                dnn_input = load_dnn_input_record(fs[0])
+                lr_input = load_lr_input_record(fs[1])
+                if not mode.is_infer():
+                    click = int(fs[2])
+                    yield [dnn_input, lr_input, click]
+                else:
+                    yield [dnn_input, lr_input]
+        return _parse
+def load_data_meta():
+    '''
+    load data meta info from path, return (dnn_input_dim, lr_input_dim)
+    '''
+    lines = read_data('data.meta.txt')
+    err_info = "wrong meta format"
+    assert len(lines) == 2, err_info
+    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
+        1], err_info
+    res = map(int, [_.split(':')[1] for _ in lines])
+    res = list(res)
+    logger.info('dnn input dim: %d' % res[0])
+    logger.info('lr input dim: %d' % res[1])
+    return res
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -47,7 +47,7 @@ def cnn_model(data):
        pool_stride=2,
        act="relu",
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.3)))
+            value=0.01)))
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
@@ -56,7 +56,7 @@ def cnn_model(data):
        pool_stride=2,
        act="relu",
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.2)))
+            value=0.01)))
    SIZE = 10
    input_shape = conv_pool_2.shape
@@ -68,7 +68,7 @@ def cnn_model(data):
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.1)))
+            initializer=fluid.initializer.Constant(value=0.01)))
    return predict

--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -247,7 +247,7 @@ class DistSeResneXt2x2(TestDistRunnerBase):
        # Reader
        train_reader = paddle.batch(
-            paddle.dataset.flowers.train(), batch_size=batch_size)
+            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)

--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import numpy as np
+import argparse
+import time
+import math
+import random
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+DTYPE = "int64"
+DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000'
+DATA_MD5 = '24e49366eb0611c552667989de2f57d5'
+# For Net
+base_lr = 0.2
+emb_lr = base_lr * 3
+dict_dim = 1500
+emb_dim = 128
+hid_dim = 128
+margin = 0.1
+sample_rate = 1
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+def get_acc(cos_q_nt, cos_q_pt, batch_size):
+    cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
+    cond = fluid.layers.cast(cond, dtype='float64')
+    cond_3 = fluid.layers.reduce_sum(cond)
+    acc = fluid.layers.elementwise_div(
+        cond_3,
+        fluid.layers.fill_constant(
+            shape=[1], value=batch_size * 1.0, dtype='float64'),
+        name="simnet_acc")
+    return acc
+def get_loss(cos_q_pt, cos_q_nt):
+    loss_op1 = fluid.layers.elementwise_sub(
+        fluid.layers.fill_constant_batch_size_like(
+            input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'),
+        cos_q_pt)
+    loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
+    loss_op3 = fluid.layers.elementwise_max(
+        fluid.layers.fill_constant_batch_size_like(
+            input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+        loss_op2)
+    avg_cost = fluid.layers.mean(loss_op3)
+    return avg_cost
+def get_optimizer():
+    # SGD optimizer
+    optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
+    return optimizer
+def train_network(batch_size, is_distributed=False, is_sparse=False):
+    # query
+    q = fluid.layers.data(
+        name="query_ids", shape=[1], dtype="int64", lod_level=1)
+    ## embedding
+    q_emb = fluid.layers.embedding(
+        input=q,
+        is_distributed=is_distributed,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__emb__",
+            learning_rate=emb_lr),
+        is_sparse=is_sparse)
+    ## vsum
+    q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
+    q_ss = fluid.layers.softsign(q_sum)
+    ## fc layer after conv
+    q_fc = fluid.layers.fc(
+        input=q_ss,
+        size=hid_dim,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__q_fc__",
+            learning_rate=base_lr))
+    # label data
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    # pt
+    pt = fluid.layers.data(
+        name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+    ## embedding
+    pt_emb = fluid.layers.embedding(
+        input=pt,
+        is_distributed=is_distributed,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__emb__",
+            learning_rate=emb_lr),
+        is_sparse=is_sparse)
+    ## vsum
+    pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
+    pt_ss = fluid.layers.softsign(pt_sum)
+    ## fc layer
+    pt_fc = fluid.layers.fc(
+        input=pt_ss,
+        size=hid_dim,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__fc__",
+            learning_rate=base_lr),
+        bias_attr=fluid.ParamAttr(name="__fc_b__"))
+    # nt
+    nt = fluid.layers.data(
+        name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+    ## embedding
+    nt_emb = fluid.layers.embedding(
+        input=nt,
+        is_distributed=is_distributed,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__emb__",
+            learning_rate=emb_lr),
+        is_sparse=is_sparse)
+    ## vsum
+    nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
+    nt_ss = fluid.layers.softsign(nt_sum)
+    ## fc layer
+    nt_fc = fluid.layers.fc(
+        input=nt_ss,
+        size=hid_dim,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__fc__",
+            learning_rate=base_lr),
+        bias_attr=fluid.ParamAttr(name="__fc_b__"))
+    cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
+    cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
+    # loss
+    avg_cost = get_loss(cos_q_pt, cos_q_nt)
+    # acc
+    acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
+    return [avg_cost, acc, cos_q_pt]
+def combination(x, y):
+    res = [[[xi, yi] for yi in y] for xi in x]
+    return res[0]
+def get_one_data(file_list):
+    for file in file_list:
+        contents = []
+        with open(file, "r") as fin:
+            for i in fin:
+                contents.append(i.strip())
+            for index, q in enumerate(contents):
+                try:
+                    one_data = [[int(j) for j in i.split(" ")]
+                                for i in q.split(";")[:-1]]
+                    if one_data[1][0] + one_data[1][1] != len(one_data) - 3:
+                        q = fin.readline()
+                        continue
+                    tmp = combination(one_data[3:3 + one_data[1][0]],
+                                      one_data[3 + one_data[1][0]:])
+                except Exception as e:
+                    continue
+                for each in tmp:
+                    yield [one_data[2], 0, each[0], each[1]]
+def get_batch_reader(file_list, batch_size):
+    def batch_reader():
+        res = []
+        for i in get_one_data(file_list):
+            if random.random() <= sample_rate:
+                res.append(i)
+            if len(res) >= batch_size:
+                yield res
+                res = []
+    return batch_reader
+def get_train_reader(batch_size):
+    # The training data set.
+    train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet",
+                              "train")
+    train_reader = get_batch_reader([train_file], batch_size)
+    train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"]
+    return train_reader, train_feed
+class TestDistSimnetBow2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        # Train program
+        avg_cost, acc, predict = \
+            train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"])))
+        inference_program = fluid.default_main_program().clone()
+        # Optimization
+        opt = get_optimizer()
+        opt.minimize(avg_cost)
+        # Reader
+        train_reader, _ = get_train_reader(batch_size)
+        return inference_program, avg_cost, train_reader, train_reader, acc, predict
+if __name__ == "__main__":
+    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
+    runtime_main(TestDistSimnetBow2x2)
--- a/python/paddle/fluid/tests/unittests/dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import numpy as np
+import argparse
+import time
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+import six
+import tarfile
+import string
+import re
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+DTYPE = "float32"
+VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab'
+VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2'
+DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz'
+DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a'
+# Load dictionary.
+def load_vocab(filename):
+    vocab = {}
+    if six.PY2:
+        with open(filename, 'r') as f:
+            for idx, line in enumerate(f):
+                vocab[line.strip()] = idx
+    else:
+        with open(filename, 'r', encoding="utf-8") as f:
+            for idx, line in enumerate(f):
+                vocab[line.strip()] = idx
+    return vocab
+def get_worddict(dict_path):
+    word_dict = load_vocab(dict_path)
+    word_dict["<unk>"] = len(word_dict)
+    dict_dim = len(word_dict)
+    return word_dict, dict_dim
+def conv_net(input,
+             dict_dim,
+             emb_dim=128,
+             window_size=3,
+             num_filters=128,
+             fc0_dim=96,
+             class_dim=2):
+    emb = fluid.layers.embedding(
+        input=input,
+        size=[dict_dim, emb_dim],
+        is_sparse=False,
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.01)))
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=num_filters,
+        filter_size=window_size,
+        act="tanh",
+        pool_type="max",
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01)))
+    fc_0 = fluid.layers.fc(
+        input=[conv_3],
+        size=fc0_dim,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01)))
+    prediction = fluid.layers.fc(
+        input=[fc_0],
+        size=class_dim,
+        act="softmax",
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01)))
+    return prediction
+def inference_network(dict_dim):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    out = conv_net(data, dict_dim)
+    return out
+def get_reader(word_dict, batch_size):
+    # The training data set.
+    train_reader = paddle.batch(train(word_dict), batch_size=batch_size)
+    # The testing data set.
+    test_reader = paddle.batch(test(word_dict), batch_size=batch_size)
+    return train_reader, test_reader
+def get_optimizer(learning_rate):
+    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
+    return optimizer
+class TestDistTextClassification2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        vocab = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "text_classification", "imdb.vocab")
+        word_dict, dict_dim = get_worddict(vocab)
+        # Input data
+        data = fluid.layers.data(
+            name="words", shape=[1], dtype="int64", lod_level=1)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        # Train program
+        predict = conv_net(data, dict_dim)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=predict, label=label)
+        inference_program = fluid.default_main_program().clone()
+        # Optimization
+        opt = get_optimizer(learning_rate=0.001)
+        opt.minimize(avg_cost)
+        # Reader
+        train_reader, test_reader = get_reader(word_dict, batch_size)
+        return inference_program, avg_cost, train_reader, test_reader, acc, predict
+def tokenize(pattern):
+    """
+    Read files that match the given pattern.  Tokenize and yield each file.
+    """
+    with tarfile.open(
+            paddle.dataset.common.download(DATA_URL, 'text_classification',
+                                           DATA_MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, other than
+        # tarfile.extractfile, which does random access and might
+        # destroy hard disks.
+        tf = tarf.next()
+        while tf != None:
+            if bool(pattern.match(tf.name)):
+                # newline and punctuations removal and ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip(six.b(
+                    "\n\r")).translate(
+                        None, six.b(string.punctuation)).lower().split()
+            tf = tarf.next()
+def reader_creator(pos_pattern, neg_pattern, word_idx):
+    UNK = word_idx['<unk>']
+    INS = []
+    def load(pattern, out, label):
+        for doc in tokenize(pattern):
+            out.append(([word_idx.get(w, UNK) for w in doc], label))
+    load(pos_pattern, INS, 0)
+    load(neg_pattern, INS, 1)
+    def reader():
+        for doc, label in INS:
+            yield doc, label
+    return reader
+def train(word_idx):
+    """
+    IMDB training set creator.
+    It returns a reader creator, each sample in the reader is an zero-based ID
+    sequence and label in [0, 1].
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("train/pos/.*\.txt$"),
+        re.compile("train/neg/.*\.txt$"), word_idx)
+def test(word_idx):
+    """
+    IMDB test set creator.
+    It returns a reader creator, each sample in the reader is an zero-based ID
+    sequence and label in [0, 1].
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("test/pos/.*\.txt$"),
+        re.compile("test/neg/.*\.txt$"), word_idx)
+if __name__ == "__main__":
+    paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5)
+    paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5)
+    runtime_main(TestDistTextClassification2x2)
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size,
    if weight_sharing:
        predict = layers.matmul(
            x=dec_output,
-            y=fluid.get_var(word_emb_param_names[0]),
+            y=fluid.framework._get_var(word_emb_param_names[0]),
            transpose_y=True)
    else:
        predict = layers.fc(input=dec_output,
@@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase):
        exe.run(startup_prog)
        exe.run(pserver_prog)
-    def run_trainer(self, use_cuda, args):
+    def run_trainer(self, args):
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        TrainTaskConfig.use_gpu = args.use_cuda
-        TrainTaskConfig.use_gpu = use_cuda
+        sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
-        sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
            args.is_dist, not args.sync_mode)
        if args.is_dist:
@@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase):
            TrainTaskConfig.batch_size = 20
            trainer_prog = fluid.default_main_program()
+        if args.use_cuda:
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
        startup_exe = fluid.Executor(place)
        TrainTaskConfig.local = not args.is_dist

--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py
@@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
 if __name__ == "__main__":
+    import os
+    os.environ['CPU_NUM'] = '1'
+    os.environ['USE_CUDA'] = "FALSE"
    runtime_main(TestDistWord2vec2x2)
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -36,7 +36,11 @@ class TestAucOp(OpTest):
            "StatPos": stat_pos,
            "StatNeg": stat_neg
        }
-        self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
+        self.attrs = {
+            'curve': 'ROC',
+            'num_thresholds': num_thresholds,
+            "slide_steps": 1
+        }
        python_auc = metrics.Auc(name="auc",
                                 curve='ROC',
@@ -45,7 +49,6 @@ class TestAucOp(OpTest):
        self.outputs = {
            'AUC': np.array(python_auc.eval()),
-            'BatchAUC': np.array(python_auc.eval()),
            'StatPosOut': np.array(python_auc._stat_pos),
            'StatNegOut': np.array(python_auc._stat_neg)
        }

--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
    def setUp(self):
        self.op_type = "conv2d"
        self.use_cudnn = False
+        self.use_cuda = False
        self.use_mkldnn = False
        self.data_format = "AnyLayout"
        self.dtype = np.float32
@@ -101,24 +102,25 @@ class TestConv2dOp(OpTest):
        }
        self.outputs = {'Output': output}
-    def testcudnn(self):
+    def testcuda(self):
-        return core.is_compiled_with_cuda() and self.use_cudnn
+        return core.is_compiled_with_cuda() and (self.use_cudnn or
+                                                 self.use_cuda)
    def test_check_output(self):
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
        self.check_output_with_place(place, atol=1e-5)
    def test_check_grad(self):
        if self.dtype == np.float16:
            return
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
        self.check_grad_with_place(
            place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
    def test_check_grad_no_filter(self):
        if self.dtype == np.float16:
            return
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
        self.check_grad_with_place(
            place, ['Input'],
            'Output',
@@ -128,7 +130,7 @@ class TestConv2dOp(OpTest):
    def test_check_grad_no_input(self):
        if self.dtype == np.float16:
            return
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
        self.check_grad_with_place(
            place, ['Filter'],
            'Output',
@@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
 class TestDepthwiseConv(TestConv2dOp):
    def init_test_case(self):
+        self.use_cuda = True
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.input_size = [2, 3, 5, 5]  # NCHW
        self.groups = 3
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
+        self.filter_size = [3, f_c, 3, 3]
        self.op_type = "depthwise_conv2d"
 class TestDepthwiseConv2(TestConv2dOp):
    def init_test_case(self):
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [3, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+class TestDepthwiseConv3(TestConv2dOp):
+    def init_test_case(self):
+        self.use_cuda = True
        self.pad = [1, 1]
        self.stride = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
@@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp):
        self.op_type = "depthwise_conv2d"
+class TestDepthwiseConvWithDilation(TestConv2dOp):
+    def init_test_case(self):
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+class TestDepthwiseConvWithDilation2(TestConv2dOp):
+    def init_test_case(self):
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):

--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -18,23 +18,27 @@ import time
 import unittest
 import os
 import sys
-import six
 import signal
 import subprocess
+import six
 import argparse
+import paddle.fluid as fluid
+RUN_STEP = 10
 class TestDistRunnerBase(object):
    def get_model(self, batch_size=2):
        raise NotImplementedError(
            "get_model should be implemented by child classes.")
-    def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
+    @staticmethod
-                       trainers, sync_mode):
+    def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers,
+                       sync_mode):
        # NOTE: import fluid until runtime, or else forking processes will cause error.
-        import paddle
+        config = fluid.DistributeTranspilerConfig()
-        import paddle.fluid as fluid
+        t = fluid.DistributeTranspiler(config=config)
-        t = fluid.DistributeTranspiler()
        t.transpile(
            trainer_id=trainer_id,
            program=main_program,
@@ -44,11 +48,9 @@ class TestDistRunnerBase(object):
        return t
    def run_pserver(self, args):
-        import paddle
-        import paddle.fluid as fluid
        self.get_model(batch_size=2)
-        if args.mem_opt:
+        # NOTE: pserver should not call memory optimize
-            fluid.memory_optimize(fluid.default_main_program())
        t = self.get_transpiler(args.trainer_id,
                                fluid.default_main_program(), args.endpoints,
                                args.trainers, args.sync_mode)
@@ -61,29 +63,34 @@ class TestDistRunnerBase(object):
        exe.run(startup_prog)
        exe.run(pserver_prog)
-    def run_trainer(self, use_cuda, args):
+    def run_trainer(self, args):
-        import paddle
-        import paddle.fluid as fluid
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=2)
        if args.mem_opt:
-            fluid.memory_optimize(fluid.default_main_program())
+            fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
        if args.is_dist:
            t = self.get_transpiler(args.trainer_id,
                                    fluid.default_main_program(),
                                    args.endpoints, args.trainers,
                                    args.sync_mode)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()
+        if args.use_cuda:
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())
        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False
        build_stra = fluid.BuildStrategy()
        if args.use_reduce:
@@ -92,7 +99,7 @@ class TestDistRunnerBase(object):
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
        exe = fluid.ParallelExecutor(
-            use_cuda,
+            args.use_cuda,
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            build_strategy=build_stra)
@@ -103,27 +110,26 @@ class TestDistRunnerBase(object):
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)
-        reader_generator = test_reader()
+        reader_generator = train_reader()
-        data = next(reader_generator)
+        def get_data():
-        first_loss, = exe.run(fetch_list=[avg_cost.name],
+            origin_batch = next(reader_generator)
-                              feed=feeder.feed(data))
+            if args.is_dist and args.use_reader_alloc:
-        print(first_loss)
+                new_batch = []
+                for offset, item in enumerate(origin_batch):
+                    if offset % 2 == args.trainer_id:
+                        new_batch.append(item)
+                return new_batch
+            else:
+                return origin_batch
-        for i in six.moves.xrange(5):
+        for _ in six.moves.xrange(RUN_STEP):
-            data = next(reader_generator)
+            loss, = exe.run(fetch_list=[avg_cost.name],
-            loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
+                            feed=feeder.feed(get_data()))
+            print(loss)
-        data = next(reader_generator)
-        last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
-        print(last_loss)
 def runtime_main(test_class):
-    import paddle
-    import paddle.fluid as fluid
-    import paddle.fluid.core as core
    parser = argparse.ArgumentParser(description='Run dist test.')
    parser.add_argument(
        '--role', type=str, required=True, choices=['pserver', 'trainer'])
@@ -135,7 +141,10 @@ def runtime_main(test_class):
        '--current_endpoint', type=str, required=False, default="")
    parser.add_argument('--sync_mode', action='store_true')
    parser.add_argument('--mem_opt', action='store_true')
+    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--use_reduce', action='store_true')
+    parser.add_argument(
+        '--use_reader_alloc', action='store_true', required=False, default=True)
    args = parser.parse_args()
@@ -143,8 +152,7 @@ def runtime_main(test_class):
    if args.role == "pserver" and args.is_dist:
        model.run_pserver(args)
    else:
-        use_cuda = True if core.is_compiled_with_cuda() else False
+        model.run_trainer(args)
-        model.run_trainer(use_cuda, args)
 import paddle.compat as cpt
@@ -156,6 +164,17 @@ class TestDistBase(unittest.TestCase):
    def _setup_config(self):
        raise NotImplementedError("tests should have _setup_config implemented")
+    def _after_setup_config(self):
+        if self._enforce_place == "CPU":
+            self.__use_cuda = False
+        elif self._enforce_place == "GPU":
+            self.__use_cuda = True
+        else:
+            if fluid.core.is_compiled_with_cuda():
+                self.__use_cuda = True
+            else:
+                self.__use_cuda = False
    def setUp(self):
        self._trainers = 2
        self._pservers = 2
@@ -163,24 +182,27 @@ class TestDistBase(unittest.TestCase):
            self._find_free_port(), self._find_free_port())
        self._python_interp = "python"
        self._sync_mode = True
+        self._enforce_place = None
        self._mem_opt = False
        self._use_reduce = False
+        self._use_reader_alloc = True
        self._setup_config()
+        self._after_setup_config()
    def _find_free_port(self):
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(('', 0))
            return s.getsockname()[1]
-    def start_pserver(self, model_file, check_error_log):
+    def start_pserver(self, model_file, check_error_log, required_envs):
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
        ps0_cmd = ps_cmd % \
-            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+                  (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-             self._trainers)
+                   self._trainers)
        ps1_cmd = ps_cmd % \
-            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+                  (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-             self._trainers)
+                   self._trainers)
        if self._sync_mode:
            ps0_cmd += " --sync_mode"
@@ -189,23 +211,23 @@ class TestDistBase(unittest.TestCase):
            ps0_cmd += " --mem_opt"
            ps1_cmd += " --mem_opt"
-        ps0_pipe = subprocess.PIPE
+        print(ps0_cmd)
-        ps1_pipe = subprocess.PIPE
+        print(ps1_cmd)
-        if check_error_log:
+        ps0_pipe = open("/tmp/ps0_err.log", "wb")
-            print(ps0_cmd)
+        ps1_pipe = open("/tmp/ps1_err.log", "wb")
-            print(ps1_cmd)
-            ps0_pipe = open("/tmp/ps0_err.log", "wb")
-            ps1_pipe = open("/tmp/ps1_err.log", "wb")
        ps0_proc = subprocess.Popen(
-            ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
+            ps0_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps0_pipe,
+            env=required_envs)
        ps1_proc = subprocess.Popen(
-            ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)
+            ps1_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps1_pipe,
+            env=required_envs)
-        if not check_error_log:
+        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
-            return ps0_proc, ps1_proc, None, None
-        else:
-            return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
    def _wait_ps_ready(self, pid):
        retry_times = 50
@@ -222,59 +244,59 @@ class TestDistBase(unittest.TestCase):
                                 (e, retry_times))
                retry_times -= 1
-    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
+    def _run_local(self, model, envs, check_error_log):
-        # TODO(typhoonzero): should auto adapt GPU count on the machine.
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_cudnn_deterministic": "1",
-            "CPU_NUM": "1"
-        }
-        if check_error_log:
+        cmd = "%s %s --role trainer" % (self._python_interp, model)
-            required_envs["GLOG_v"] = "7"
-            required_envs["GLOG_logtostderr"] = "1"
-        # Run local to get a base line
+        if self.__use_cuda:
-        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+            cmd += " --use_cuda"
-        env_local.update(required_envs)
+            env_local = {"CUDA_VISIBLE_DEVICES": "0"}
-        local_cmd = "%s %s --role trainer" % (self._python_interp, model_file)
-        if not check_error_log:
-            local_proc = subprocess.Popen(
-                local_cmd.split(" "),
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                env=env_local)
        else:
+            env_local = {'CPU_NUM': '1'}
+        envs.update(env_local)
+        if check_error_log:
            err_log = open("/tmp/trainer.err.log", "wb")
            local_proc = subprocess.Popen(
-                local_cmd.split(" "),
+                cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=err_log,
-                env=env_local)
+                env=envs)
+        else:
+            local_proc = subprocess.Popen(
+                cmd.split(" "),
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                env=envs)
-        local_proc.wait()
+        local_out, local_err = local_proc.communicate()
-        out, err = local_proc.communicate()
+        local_ret = cpt.to_text(local_out)
-        local_ret = cpt.to_text(out)
-        sys.stderr.write('local_loss: %s\n' % local_ret)
+        if check_error_log:
-        sys.stderr.write('local_stderr: %s\n' % err)
+            err_log.close()
+        sys.stderr.write('local_stdout: %s\n' % local_ret)
+        sys.stderr.write('local_stderr: %s\n' % local_err)
+        local_losses = local_ret.split("\n")
+        return local_losses
+    def _run_cluster(self, model, envs, check_error_log):
        # Run dist train to compare with local results
-        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
+        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
-                                                          check_error_log)
+                                                          check_error_log, envs)
        self._wait_ps_ready(ps0.pid)
        self._wait_ps_ready(ps1.pid)
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
        tr0_cmd = tr_cmd % \
-            (self._python_interp, model_file, self._ps_endpoints,
+                  (self._python_interp, model, self._ps_endpoints,
-             0, ps0_ep, self._trainers)
+                   0, ps0_ep, self._trainers)
        tr1_cmd = tr_cmd % \
-            (self._python_interp, model_file, self._ps_endpoints,
+                  (self._python_interp, model, self._ps_endpoints,
-             1, ps1_ep, self._trainers)
+                   1, ps1_ep, self._trainers)
        if self._sync_mode:
            tr0_cmd += " --sync_mode"
@@ -285,20 +307,25 @@ class TestDistBase(unittest.TestCase):
        if self._use_reduce:
            tr0_cmd += " --use_reduce"
            tr1_cmd += " --use_reduce"
+        if self._use_reader_alloc:
+            tr0_cmd += " --use_reader_alloc"
+            tr1_cmd += " --use_reader_alloc"
+        if self.__use_cuda:
+            tr0_cmd += " --use_cuda"
+            tr1_cmd += " --use_cuda"
+            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        else:
+            env0 = {'CPU_NUM': '1'}
+            env1 = {'CPU_NUM': '1'}
-        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+        env0.update(envs)
-        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        env1.update(envs)
-        env0.update(required_envs)
-        env1.update(required_envs)
-        FNULL = open(os.devnull, 'w')
-        tr0_pipe = subprocess.PIPE
+        print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
-        tr1_pipe = subprocess.PIPE
+        print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
-        if check_error_log:
+        tr0_pipe = open("/tmp/tr0_err.log", "wb")
-            print("tr0_cmd:", tr0_cmd)
+        tr1_pipe = open("/tmp/tr1_err.log", "wb")
-            print("tr1_cmd:", tr1_cmd)
-            tr0_pipe = open("/tmp/tr0_err.log", "wb")
-            tr1_pipe = open("/tmp/tr1_err.log", "wb")
        tr0_proc = subprocess.Popen(
            tr0_cmd.strip().split(" "),
@@ -311,35 +338,65 @@ class TestDistBase(unittest.TestCase):
            stderr=tr1_pipe,
            env=env1)
-        tr0_proc.wait()
+        tr0_out, tr0_err = tr0_proc.communicate()
-        tr1_proc.wait()
+        tr0_loss_text = cpt.to_text(tr0_out)
-        out, err = tr0_proc.communicate()
+        tr1_out, tr1_err = tr1_proc.communicate()
-        sys.stderr.write('dist_stderr: %s\n' % err)
+        tr1_loss_text = cpt.to_text(tr1_out)
-        loss_data0 = cpt.to_text(out)
-        sys.stderr.write('dist_loss: %s\n' % loss_data0)
-        lines = loss_data0.split("\n")
-        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
-        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
-        local_lines = local_ret.split("\n")
-        local_first_loss = eval(local_lines[0])[0]
-        local_last_loss = eval(local_lines[1])[0]
        # close trainer file
-        if check_error_log:
+        tr0_pipe.close()
-            tr0_pipe.close()
+        tr1_pipe.close()
-            tr1_pipe.close()
-            ps0_pipe.close()
+        ps0_pipe.close()
-            ps1_pipe.close()
+        ps1_pipe.close()
        # FIXME: use terminate() instead of sigkill.
        os.kill(ps0.pid, signal.SIGKILL)
        os.kill(ps1.pid, signal.SIGKILL)
        ps0.terminate()
        ps1.terminate()
-        ps0.wait()
-        ps1.wait()
-        FNULL.close()
-        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
+        # print log
-        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
+        sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
+        sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err)
+        sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text)
+        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        tr0_losses = tr0_loss_text.split("\n")
+        tr1_losses = tr1_loss_text.split("\n")
+        return tr0_losses, tr1_losses
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        # TODO(typhoonzero): should auto adapt GPU count on the machine.
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_cudnn_deterministic": "1",
+            "http_proxy": ""
+        }
+        required_envs.update(need_envs)
+        if check_error_log:
+            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_logtostderr"] = "1"
+        local_losses\
+            = self._run_local(model_file, required_envs,
+                                       check_error_log)
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs,
+                                                   check_error_log)
+        for step_id in range(RUN_STEP):
+            local_loss = eval(local_losses[step_id])[0]
+            tr0_loss = eval(tr0_losses[step_id])[0]
+            tr1_loss = eval(tr1_losses[step_id])[0]
+            dist_loss = (tr0_loss + tr1_loss) / 2
+            print(str(local_loss) + ":" + str(dist_loss))
+            self.assertAlmostEqual(local_loss, dist_loss, delta=delta)
--- a/python/paddle/fluid/tests/notest_concurrency.py
+++ b/python/paddle/fluid/tests/notest_concurrency.py
@@ -11,31 +11,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
+import os
 import unittest
-import paddle.fluid as fluid
+from test_dist_base import TestDistBase
-import paddle.fluid.core as core
-from paddle.fluid.executor import Executor
-class TestRoutineOp(unittest.TestCase):
-    def test_simple_routine(self):
-        ch = fluid.make_channel(
-            dtype=core.VarDesc.VarType.BOOL, name="CreateChannel")
-        with fluid.Go():
-            fluid.channel_send(ch, True)
-        result = fluid.channel_recv(ch)
+class TestDistCTR2x2(TestDistBase):
-        fluid.channel_close(ch)
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-        outs = exe.run(fetch_list=[result])
+def test_dist_ctr(self):
-        self.assertEqual(outs[0], True)
+    self.check_with_place("dist_ctr.py", delta=1e-7)
-if __name__ == '__main__':
+if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase):
        self._use_reduce = False
    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=1e-7)
+        self.check_with_place("dist_mnist.py", delta=1e-5)
 class TestDistMnist2x2WithMemopt(TestDistBase):
@@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase):
        self._mem_opt = True
    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=1e-7)
+        self.check_with_place("dist_mnist.py", delta=1e-5)
 class TestDistMnistAsync(TestDistBase):

--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -20,24 +20,25 @@ from test_dist_base import TestDistBase
 class TestDistSeResneXt2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
+        self._use_reader_alloc = False
    def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=1e-7)
+        self.check_with_place("dist_se_resnext.py", delta=100)
-# TODO(typhoonzero): fix this test
+class TestDistseResnXt2x2WithMemopt(TestDistBase):
-# class TestDistseResnXt2x2WithMemopt(TestDistBase):
+    def _setup_config(self):
-#     def _setup_config(self):
+        self._sync_mode = True
-#         self._sync_mode = True
+        self._mem_opt = True
-#         self._mem_opt = True
-#     def test_dist_train(self):
+    def test_dist_train(self):
-#         self.check_with_place("dist_se_resnext.py", delta=1e-7)
+        self.check_with_place("dist_se_resnext.py", delta=100)
 class TestDistSeResneXt2x2Async(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
+        self._use_reader_alloc = False
    def test_dist_train(self):
        self.check_with_place("dist_se_resnext.py", delta=100)

--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import os
+import unittest
+from test_dist_base import TestDistBase
+class TestDistSimnetBowDense2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+    def test_simnet_bow(self):
+        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=1e-5,
+            check_error_log=False,
+            need_envs=need_envs)
+class TestDistSimnetBow2x2DenseAsync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._enforce_place = "CPU"
+    def test_simnet_bow(self):
+        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=100,
+            check_error_log=False,
+            need_envs=need_envs)
+class TestDistSimnetBowSparse2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+    def test_simnet_bow(self):
+        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=1e-5,
+            check_error_log=False,
+            need_envs=need_envs)
+class TestDistSimnetBow2x2SparseAsync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._enforce_place = "CPU"
+    def test_simnet_bow(self):
+        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=100,
+            check_error_log=False,
+            need_envs=need_envs)
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import os
+import unittest
+from test_dist_base import TestDistBase
+class TestDistTextClassification2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+    def test_text_classification(self):
+        self.check_with_place("dist_text_classification.py", delta=1e-6)
+class TestDistTextClassification2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._enforce_place = "CPU"
+    def test_se_resnext(self):
+        self.check_with_place("dist_text_classification.py", delta=100)
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest):
        ])
+class TestDecayedAdagrad(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1)
+        opt.minimize(avg_cost)
+    def transpiler_test_impl(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer, _ = self.get_trainer()
 class TestLRDecayConditional(TranspilerTest):
    def net_conf(self):
        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')

--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase):
        self._sync_mode = False
    def test_dist_train(self):
-        self.check_with_place("dist_word2vec.py", delta=1)
+        self.check_with_place("dist_word2vec.py", delta=100)
 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
@@ -76,8 +76,8 @@ class TestInferShape(unittest.TestCase):
        mul_op_desc.set_input("X", ["x"])
        mul_op_desc.set_input("Y", ["y"])
        mul_op_desc.set_output("Out", ["out"])
-        mul_op_desc.set_attr("x_num_col_dims", 1)
+        mul_op_desc._set_attr("x_num_col_dims", 1)
-        mul_op_desc.set_attr("y_num_col_dims", 1)
+        mul_op_desc._set_attr("y_num_col_dims", 1)
        mul_op_desc.check_attrs()
        mul_op_desc.infer_shape(block)

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -825,6 +825,15 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(out)
        print(str(program))
+    def iou_similarity(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[16], dtype="float32")
+            y = layers.data(name="y", shape=[16], dtype="float32")
+            out = layers.iou_similarity(x, y, name='iou_similarity')
+            self.assertIsNotNone(out)
+        print(str(program))
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -38,40 +38,40 @@ class TestOpDesc(unittest.TestCase):
        self.assertEqual(['z'], op.output("Out"))
        self.assertEqual(["Out"], op.output_names())
-        op.set_attr("int_attr", 1)
+        op._set_attr("int_attr", 1)
        self.assertEqual(1, op.attr("int_attr"))
        self.assertTrue(op.has_attr("int_attr"))
        self.assertEqual(core.AttrType.INT, op.attr_type("int_attr"))
-        op.set_attr("float_attr", -1.32)
+        op._set_attr("float_attr", -1.32)
        self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4)
        self.assertTrue(op.has_attr("float_attr"))
-        op.set_attr("bool_attr", False)
+        op._set_attr("bool_attr", False)
        self.assertFalse(op.attr("bool_attr"))
-        op.set_attr("string_attr", "abc")
+        op._set_attr("string_attr", "abc")
        self.assertEqual("abc", op.attr("string_attr"))
        self.assertTrue(op.has_attr("string_attr"))
-        op.set_attr("ints_attr", [1, 2, 3])
+        op._set_attr("ints_attr", [1, 2, 3])
        self.assertEqual([1, 2, 3], op.attr("ints_attr"))
        expected = [1.2, 2.3, 3.4]
-        op.set_attr("floats_attr", expected)
+        op._set_attr("floats_attr", expected)
        for e, a in zip(expected, op.attr("floats_attr")):
            self.assertAlmostEqual(e, a, delta=1e-4)
-        op.set_attr("strings_attr", ["a", "b", "c"])
+        op._set_attr("strings_attr", ["a", "b", "c"])
        self.assertEqual(["a", "b", "c"], op.attr("strings_attr"))
-        op.set_attr("bools_attr", [True, False, True])
+        op._set_attr("bools_attr", [True, False, True])
        self.assertEqual([True, False, True], op.attr("bools_attr"))
        self.assertEqual(8, len(op.attr_names()))
-        op.set_block_attr("block_attr", program_desc.block(0))
+        op.set_block_attr("_block_attr", program_desc.block(0))
-        self.assertEqual(0, op.block_attr_id("block_attr"))
+        self.assertEqual(0, op._block_attr_id("_block_attr"))
        mul_op = block.append_op()
        mul_op.set_type("mul")

--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -128,7 +128,7 @@ def op_to_code(op):
        attr_type = op.desc.attr_type(name)
        if attr_type == core.AttrType.BLOCK:
            a = "{name} = block[{value}]".format(
-                name=name, type=attr_type, value=op.block_attr_id(name))
+                name=name, type=attr_type, value=op._block_attr_id(name))
            attrs_str += a
            if i != len(attr_names) - 1:
                attrs_str += ", "
@@ -136,7 +136,7 @@ def op_to_code(op):
        if attr_type == core.AttrType.BLOCKS:
            a = "{name} = blocks{value}".format(
-                name=name, type=attr_type, value=op.blocks_attr_ids(name))
+                name=name, type=attr_type, value=op._blocks_attr_ids(name))
            attrs_str += a
            if i != len(attr_names) - 1:
                attrs_str += ", "

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -39,8 +39,8 @@ import six
 from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
 from ..framework import Program, default_main_program, \
-                        default_startup_program, Block, \
+    default_startup_program, Block, \
-                        Parameter, grad_var_name
+    Parameter, grad_var_name
 from .details import *
 from functools import reduce
@@ -178,7 +178,7 @@ class DistributeTranspiler(object):
                                                                pserver_program)
           elif role == "TRAINER":
                trainer_program = t.get_trainer_program()
           # for nccl2 mode
           config = fluid.DistributeTranspilerConfig()
           config.mode = "nccl2"
@@ -470,7 +470,10 @@ class DistributeTranspiler(object):
        """
        # remove optimize ops and add a send op to main_program
        # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
+        lr_ops = self._get_lr_ops()
        delete_ops(self.origin_program.global_block(), self.optimize_ops)
+        delete_ops(self.origin_program.global_block(), lr_ops)
        self.origin_program.__str__()
        if wait_port:
@@ -534,7 +537,7 @@ class DistributeTranspiler(object):
            })
        for varname, splited_var in six.iteritems(self.param_var_mapping):
-            #add concat ops to merge splited parameters received from parameter servers.
+            # add concat ops to merge splited parameters received from parameter servers.
            if len(splited_var) <= 1:
                continue
            # NOTE: if enable memory optimization, origin vars maybe removed.
@@ -668,7 +671,7 @@ in a single call.")
                __clone_lr_op_sub_block__(cloned_op, program, new_sub_block)
            # reset the block of op
-            op.set_attr('sub_block', new_sub_block)
+            op._set_attr('sub_block', new_sub_block)
        # append lr decay ops to the child block if exists
        lr_ops = self._get_lr_ops()
@@ -734,19 +737,14 @@ in a single call.")
            table_opt_block = self._create_table_optimize_block(
                pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
            optimize_blocks.append(table_opt_block)
-            prefetch_var_name_to_block_id = self._create_prefetch_block(
+            lookup_table_var_name_to_block_id = self._create_prefetch_block(
                pserver_index, pserver_program, table_opt_block)
            checkpoint_block_id = self._create_checkpoint_save_block(
                pserver_program, table_opt_block.idx)
            pserver_program._distributed_lookup_table = self.table_name
+            prefetch_var_name_to_block_id.extend(
-        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
+                lookup_table_var_name_to_block_id)
-        # not be executed, so it's safe to use optimize_block to hold the place
-        if self.has_distributed_lookup_table:
-            assert len(prefetch_var_name_to_block_id) > 0
-        else:
-            assert len(prefetch_var_name_to_block_id) == 0
        attrs = {
            "optimize_blocks": optimize_blocks,
@@ -755,11 +753,14 @@ in a single call.")
            "sync_mode": self.sync_mode,
            "grad_to_block_id": grad_to_block_id,
        }
-        if len(prefetch_var_name_to_block_id) > 0:
-            attrs['prefetch_var_name_to_block_id'] \
+        if self.has_distributed_lookup_table:
-                = prefetch_var_name_to_block_id
            attrs['checkpint_block_id'] = checkpoint_block_id
+        if len(prefetch_var_name_to_block_id) > 0:
+            attrs[
+                'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id
        # step5 append the listen_and_serv op
        pserver_program.global_block().append_op(
            type="listen_and_serv",
@@ -864,7 +865,7 @@ to transpile() call.")
                if op.type in [
                        "gaussian_random", "fill_constant", "uniform_random"
                ]:
-                    op.set_attr("shape", list(new_outputs["Out"].shape))
+                    op._set_attr("shape", list(new_outputs["Out"].shape))
                s_prog.global_block().append_op(
                    type=op.type,
                    inputs=new_inputs,
@@ -1013,7 +1014,7 @@ to transpile() call.")
        for g, p in zip(grad_blocks, param_blocks):
            g_name, g_bid, _ = g.split(":")
            p_name, p_bid, _ = p.split(":")
-            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
+            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \
                self.param_var_mapping[p_name][int(p_bid)]
        # create mapping of endpoint -> split var to create pserver side program
@@ -1320,7 +1321,7 @@ to transpile() call.")
            if len(splited) == 1:
                if self.sync_mode and add_trainer_suffix:
                    new_var_name = "%s.trainer_%d" % \
-                        (orig_var.name, self.trainer_id)
+                                   (orig_var.name, self.trainer_id)
                    program.global_block()._rename_var(varname, new_var_name)
                    var_mapping[varname] = \
                        [program.global_block().var(new_var_name)]
@@ -1343,10 +1344,10 @@ to transpile() call.")
                new_var_name = ""
                if self.sync_mode and add_trainer_suffix:
                    new_var_name = "%s.block%d.trainer_%d" % \
-                        (varname, i, self.trainer_id)
+                                   (varname, i, self.trainer_id)
                else:
                    new_var_name = "%s.block%d" % \
-                        (varname, i)
+                                   (varname, i)
                var = program.global_block().create_var(
                    name=new_var_name,
                    persistable=False,
@@ -1430,6 +1431,9 @@ to transpile() call.")
        elif op_type == "rmsprop":
            if varkey in ["Moment", "MeanSquare"]:
                return param_shape
+        elif op_type == "decayed_adagrad":
+            if varkey == "Moment":
+                return param_shape
        elif op_type == "sgd":
            pass
        return orig_shape
@@ -1484,9 +1488,8 @@ to transpile() call.")
            vars2merge = []
            for i in range(self.trainer_num):
                per_trainer_name = "%s.trainer_%d" % \
-                (merged_var_name, i)
+                                   (merged_var_name, i)
                vars2merge.append(pserver_block.vars[per_trainer_name])
            optimize_block.append_op(
                type="sum",
                inputs={"X": vars2merge},
@@ -1645,7 +1648,7 @@ to transpile() call.")
        # one op's output is another op's input, we say
        # the two operator is connected.
        if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \
-           set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
+                set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
            return True
        return False
@@ -1662,7 +1665,7 @@ to transpile() call.")
    def _is_optimizer_op(self, op):
        if "Param" in op.input_names and \
-            "LearningRate" in op.input_names:
+                "LearningRate" in op.input_names:
            return True
        return False
@@ -1737,7 +1740,7 @@ to transpile() call.")
                # NOTE: we need to skip all optimize ops, since it is connected
                # with forward/backward ops and lr ops, we only need the lr ops.
                if op1 != op2 and self._is_op_connected(op1, op2) and \
-                    not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
+                        not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
                    ufind.union(op1, op2)
        # find all ops which is related with lr var
        for op1 in block.ops:

--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -163,7 +163,7 @@ class InferenceTranspiler(object):
                next_op = self.block.ops[i + 1]
                if next_op.type == 'relu':
                    # modify bnorm OP to include relu
-                    current_op.set_attr("fuse_with_relu", True)
+                    current_op._set_attr("fuse_with_relu", True)
                    # remove relu OP
                    self.block._remove_op(i + 1)
            i = i + 1
@@ -377,7 +377,7 @@ class InferenceTranspiler(object):
                type=old_var.type,
                dtype=old_var.dtype,
                shape=old_var.shape)
-            op.rename_input(old_param_name, new_param_name)
+            op._rename_input(old_param_name, new_param_name)
            self.scope.var(new_param_name)
            tensor = self.scope.find_var(new_param_name).get_tensor()
@@ -463,8 +463,8 @@ class InferenceTranspiler(object):
            current_op = self.block.ops[i]
            for input_arg in current_op.input_arg_names:
                if input_arg in self.input_map:
-                    current_op.rename_input(input_arg,
+                    current_op._rename_input(input_arg,
-                                            self.input_map[input_arg])
+                                             self.input_map[input_arg])
    def _remove_unused_var(self):
        '''

--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -14,10 +14,10 @@
 from __future__ import print_function
-from collections import defaultdict, OrderedDict, Callable
+from collections import defaultdict, MutableSet
 from .. import core
 from ... import compat as cpt
-from ..framework import Program, default_main_program, Parameter, Variable
+from ..framework import Program, default_main_program, Parameter, Variable, core
 from ..backward import _rename_arg_
 from functools import reduce
 from six.moves import range
@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
 PRINT_LOG = False
+class OrderedSet(MutableSet):
+    def __init__(self, iterable=None):
+        self.end = end = []
+        end += [None, end, end]  # sentinel node for doubly linked list
+        self.map = {}  # key --> [key, prev, next]
+        if iterable is not None:
+            self |= iterable
+    def __len__(self):
+        return len(self.map)
+    def __contains__(self, key):
+        return key in self.map
+    def add(self, key):
+        if key not in self.map:
+            end = self.end
+            curr = end[1]
+            curr[2] = end[1] = self.map[key] = [key, curr, end]
+    def update(self, other):
+        for e in other:
+            self.add(e)
+    def discard(self, key):
+        if key in self.map:
+            key, prev, next = self.map.pop(key)
+            prev[2] = next
+            next[1] = prev
+    def remove(self, key):
+        self.discard(key)
+    def __iter__(self):
+        end = self.end
+        curr = end[2]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[2]
+    def __reversed__(self):
+        end = self.end
+        curr = end[1]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[1]
+    def pop(self, last=True):
+        if not self:
+            raise KeyError('set is empty')
+        key = self.end[1][0] if last else self.end[2][0]
+        self.discard(key)
+        return key
+    def __repr__(self):
+        if not self:
+            return '%s()' % (self.__class__.__name__, )
+        return '%s(%r)' % (self.__class__.__name__, list(self))
+    def __eq__(self, other):
+        if isinstance(other, OrderedSet):
+            return len(self) == len(other) and list(self) == list(other)
+        return set(self) == set(other)
 class ControlFlowGraph(object):
    def __init__(self, program, ops, forward_num, skip_opt):
        self._program = program
        self._ops = ops
        self._forward_num = forward_num
-        self._successors = defaultdict(set)
+        self._successors = defaultdict(OrderedSet)
-        self._presuccessors = defaultdict(set)
+        self._presuccessors = defaultdict(OrderedSet)
-        self._uses = defaultdict(set)
+        self._uses = defaultdict(OrderedSet)
-        self._defs = defaultdict(set)
+        self._defs = defaultdict(OrderedSet)
-        self._live_in = defaultdict(set)
+        self._live_in = defaultdict(OrderedSet)
-        self._live_out = defaultdict(set)
+        self._live_out = defaultdict(OrderedSet)
        self._skip_opt = skip_opt
        self.pool = []
@@ -116,7 +181,7 @@ class ControlFlowGraph(object):
        # NOTE: must sort the in_diff set for cases that get different cache var.
        # FIXME(typhoonzero): maybe use a "sorted set" is better than this.
        can_optimize = [
-            x for x in sorted(list(in_diff))
+            x for x in in_diff
            if self._check_var_validity(block_desc, x, is_forward)
        ]
        if can_optimize:
@@ -224,7 +289,7 @@ class ControlFlowGraph(object):
            if self.pool:
                # NOTE: must sort the in_diff set for cases that get different cache var.
                defs_can_optimize = [
-                    x for x in sorted(list(self._defs[i]))
+                    x for x in self._defs[i]
                    if self._check_var_validity(block_desc, x, is_forward)
                ]
                out_pair = [
@@ -381,7 +446,19 @@ def _get_cfgs(input_program):
    return cfgs
-def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
+def _is_opt_role_op(op):
+    op_maker = core.op_proto_and_checker_maker
+    optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+    if op_maker.kOpRoleAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+        return True
+def memory_optimize(input_program,
+                    skip_opt_set=None,
+                    print_log=False,
+                    level=0,
+                    skip_grads=False):
    """Optimize memory by reusing var memory.
      Note: it doesn't not support subblock nested in subblock.
@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
        raise ValueError("only support opt_level 0 or 1.")
    global PRINT_LOG
    PRINT_LOG = print_log
+    if skip_grads:
+        grad_set = set()
+        OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+        for op in input_program.global_block().ops:
+            if _is_opt_role_op(op):
+                if op.attr(OP_ROLE_VAR):
+                    grad_name = op.attr(OP_ROLE_VAR)[1]
+                    grad_set.add(grad_name)
+        if not skip_opt_set:
+            skip_opt_set = grad_set
+        else:
+            skip_opt_set.update(grad_set)
    cfgs = _get_cfgs(input_program)
    for cfg in cfgs:
        cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)