diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ec632e20690eafdc558e24f160270a89b29ee41..e4442d254901e2524385452ebe5ac6f6df3056f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,7 +212,7 @@ endif() if (WITH_JEMALLOC) find_package(JeMalloc REQUIRED) include_directories(${JEMALLOC_INCLUDE_DIR}) - add_definitions(-DWITH_JEMALLOC) + add_definitions(-DPADDLE_WITH_JEMALLOC) endif() include(generic) # simplify cmake module @@ -276,9 +276,3 @@ add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) endif() - -if(WITH_DOC) - find_package(Sphinx REQUIRED) - find_python_module(recommonmark REQUIRED) - add_subdirectory(doc) -endif() diff --git a/Dockerfile b/Dockerfile index acfd091265e26d6c29c561d166fed2504c0cff1c..fe0721e9b99b5e028df2f6228ff04cb56a567a3f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub # ENV variables ARG WITH_GPU ARG WITH_AVX -ARG WITH_DOC ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} -ENV WITH_DOC=${WITH_DOC:-OFF} ENV HOME /root # Add bash enhancements diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake deleted file mode 100644 index f74cd4ff8c9c2c52319b18ac37264167b3718eae..0000000000000000000000000000000000000000 --- a/cmake/FindSphinx.cmake +++ /dev/null @@ -1,147 +0,0 @@ -# - This module looks for Sphinx -# Find the Sphinx documentation generator -# -# This modules defines -# SPHINX_EXECUTABLE -# SPHINX_FOUND - -find_program(SPHINX_EXECUTABLE - NAMES sphinx-build - PATHS - /usr/bin - /usr/local/bin - /opt/local/bin - DOC "Sphinx documentation generator" -) - -if( NOT SPHINX_EXECUTABLE ) - set(_Python_VERSIONS - 2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5 - ) - - foreach( _version ${_Python_VERSIONS} ) - set( _sphinx_NAMES sphinx-build-${_version} ) - - find_program( SPHINX_EXECUTABLE - NAMES ${_sphinx_NAMES} - PATHS - /usr/bin - /usr/local/bin - /opt/loca/bin - DOC "Sphinx documentation generator" - ) - endforeach() -endif() - -include(FindPackageHandleStandardArgs) - -find_package_handle_standard_args(Sphinx DEFAULT_MSG - SPHINX_EXECUTABLE -) - - -option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON ) -option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF ) -option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF ) -option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF ) -option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF ) -option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF ) -option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF ) -option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF ) -option( SPHINX_TEXT_OUTPUT "Build plain text files." 
OFF ) - - -mark_as_advanced( - SPHINX_EXECUTABLE - SPHINX_HTML_OUTPUT - SPHINX_DIRHTML_OUTPUT - SPHINX_HTMLHELP_OUTPUT - SPHINX_QTHELP_OUTPUT - SPHINX_DEVHELP_OUTPUT - SPHINX_EPUB_OUTPUT - SPHINX_LATEX_OUTPUT - SPHINX_MAN_OUTPUT - SPHINX_TEXT_OUTPUT -) - -function( Sphinx_add_target target_name builder conf cache source destination ) - add_custom_target( ${target_name} ALL - COMMAND ${SPHINX_EXECUTABLE} -b ${builder} - -d ${cache} - -c ${conf} - ${source} - ${destination} - COMMENT "Generating sphinx documentation: ${builder}" - COMMAND cd ${destination} && ln -sf ./index_*.html index.html - ) - - set_property( - DIRECTORY APPEND PROPERTY - ADDITIONAL_MAKE_CLEAN_FILES - ${destination} - ) -endfunction() - -# Target dependencies can be optionally listed at the end. -function( Sphinx_add_targets target_base_name conf source base_destination ) - - set( _dependencies ) - - foreach( arg IN LISTS ARGN ) - set( _dependencies ${_dependencies} ${arg} ) - endforeach() - - if( ${SPHINX_HTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html ) - - add_dependencies( ${target_base_name}_html ${_dependencies} ) - endif() - - if( ${SPHINX_DIRHTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml ) - - add_dependencies( ${target_base_name}_dirhtml ${_dependencies} ) - endif() - - if( ${SPHINX_QTHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp ) - - add_dependencies( ${target_base_name}_qthelp ${_dependencies} ) - endif() - - if( ${SPHINX_DEVHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp ) - - add_dependencies( ${target_base_name}_devhelp ${_dependencies} ) - endif() - - if( ${SPHINX_EPUB_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub ) - - add_dependencies( ${target_base_name}_epub ${_dependencies} ) - endif() - - if( ${SPHINX_LATEX_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex ) - - add_dependencies( ${target_base_name}_latex ${_dependencies} ) - endif() - - if( ${SPHINX_MAN_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man ) - - add_dependencies( ${target_base_name}_man ${_dependencies} ) - endif() - - if( ${SPHINX_TEXT_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text ) - - add_dependencies( ${target_base_name}_text ${_dependencies} ) - endif() - - if( ${BUILD_TESTING} ) - sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck ) - - add_dependencies( ${target_base_name}_linkcheck ${_dependencies} ) - endif() -endfunction() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 1f4dbe0b49825aef9a236f7ae72c6bea168b2ec5..6679a09dfc9dd00cfe3b5c5da3e12bd1c1389432 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -388,6 +388,7 @@ function(cc_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) # No unit test should exceed 10 minutes. 
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) @@ -460,6 +461,7 @@ function(nv_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endfunction(nv_test) @@ -708,9 +710,10 @@ function(py_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true + FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 59c40a0e5d18b753038f2b9301d1c9494e3901be..c2d04828564e69d7ac965881057f185194aa0475 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -52,8 +52,8 @@ function(op_library TARGET) endif() if(WITH_MKLDNN) string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc) - list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) + list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc) endif() endif() else() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index ad39542b4d8356229a3e060bffb45d8ea89f5b70..84b4677777a79b30ba8936025a60e8d6d9186a2c 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -67,6 +67,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) @@ -121,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) 
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) -paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) @@ -141,10 +142,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)) paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) -paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None)) +paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)) paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) -paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)) +paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)) paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, 
defaults=(None,)) paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -197,7 +198,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -212,6 +213,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) @@ -318,10 +320,12 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)) paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) +paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 
'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) +paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -357,6 +361,10 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.Calibrator.__init__ ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.contrib.Calibrator.sample_data ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.Calibrator.save_int8_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 
'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a167511160d074c13ca1dca36b4f2c5eeea4bb93..66f11dedbaccd7febcd75fa7ade9c68b6c42022c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,4 +1,3 @@ - #windows treat symbolic file as a real file, which is different with unix #We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) @@ -129,12 +128,6 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -if(WITH_NGRAPH) - cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) - cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) -endif(WITH_NGRAPH) - cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) @@ -171,13 +164,12 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - else() - if(WITH_NGRAPH) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) - else(WITH_NGRAPH) + if (WITH_NGRAPH) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine) + else () cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(WITH_NGRAPH) + endif() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() @@ -214,3 +206,24 @@ endif (NOT WIN32) cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) + +# Get the current working branch execute_process( + COMMAND git rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Get the latest abbreviated commit hash of the working branch +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +message(STATUS "commit: ${PADDLE_COMMIT}") +message(STATUS "branch: ${PADDLE_BRANCH}") + +configure_file(commit.h.in commit.h) diff --git a/paddle/fluid/framework/commit.h.in b/paddle/fluid/framework/commit.h.in new file mode 100644 index 0000000000000000000000000000000000000000..3a33ece624443a99083ae29abb70254a5ac40a3d --- /dev/null +++ b/paddle/fluid/framework/commit.h.in @@ -0,0 +1,21 @@ +#pragma once + +#include <string> + +namespace paddle { +namespace framework { + +static std::string paddle_commit() { + return "@PADDLE_COMMIT@"; +} +
+static std::string paddle_compile_branch() { + return "@PADDLE_BRANCH@"; +} + +static std::string paddle_version() { + return "@PADDLE_VERSION@"; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 603df2e06936e3d9d8e7ec62efd0c6e83200239c..cd24a3175953bf323748bf0c7e3159761c13f0a9 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -91,7 +91,7 @@ struct BuildStrategy { int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; - bool remove_unnecessary_lock_{false}; + bool remove_unnecessary_lock_{true}; // NOTE: // Before you add new options, think if it's a general strategy that works diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736312b3050debe745f2d3c108469c5d6..318694a1d4b0599655f05bf01c907fb6c07a4193 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,6 +25,9 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; + // If we set this to 1, all variables will be deleted when a batch finishes, and + // this will lose 15%+ performance. + // Please be aware of this parameter. size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c93bbe7ceecce9193acfae0b4e03c06212edd6d6..4323883fa5cc9b26a68c2980f3b7a49eca610543 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -27,7 +27,7 @@ limitations under the License.
*/ #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_NGRAPH -#include "paddle/fluid/framework/ngraph_operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" #endif DECLARE_bool(benchmark); @@ -133,24 +133,6 @@ static void DeleteUnusedTensors( } } -static void EnableFusedOp(ExecutorPrepareContext* ctx) { -#ifdef PADDLE_WITH_NGRAPH - VLOG(3) << "use_ngraph=True"; - auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_); - for (auto& interval : intervals) { - auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0), - interval.at(1)); - *interval[0] = std::unique_ptr(ng_op); - } - for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { - ctx->ops_.erase(it->at(0) + 1, it->at(1)); - } -#else - LOG(WARNING) - << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; -#endif -} - Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); +#endif auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -379,7 +364,6 @@ std::unique_ptr Executor::Prepare( for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } - if (FLAGS_use_ngraph) EnableFusedOp(ctx.get()); return ctx; } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b118dccd1b3de881b4791bff6cd331726c8e05da..914bcce7755bcf0651da29dd669f5d6d14e081d2 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -10,8 +10,22 @@ function(pass_library TARGET DEST) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) + set(targetPrefix "") + + # Get optional argument + set(extraMacroArgs ${ARGN}) + list(LENGTH extraMacroArgs numExtraMacroArgs) + if(numExtraMacroArgs GREATER 0) + list(GET extraMacroArgs 0 targetPrefix) + endif() + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + if(targetPrefix) + cc_library(${TARGET} SRCS ${targetPrefix}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + else() + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + endif() + # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. 
if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") message(STATUS "add pass ${TARGET} ${DEST}") @@ -62,11 +76,11 @@ foreach (index RANGE 3 6) endforeach() if(WITH_MKLDNN) - pass_library(mkldnn_placement_pass base) - pass_library(depthwise_conv_mkldnn_pass base) - pass_library(conv_bias_mkldnn_fuse_pass inference) - pass_library(conv_relu_mkldnn_fuse_pass inference) - pass_library(conv_elementwise_add_mkldnn_fuse_pass inference) + pass_library(mkldnn_placement_pass base mkldnn) + pass_library(depthwise_conv_mkldnn_pass base mkldnn) + pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn) + pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn) + pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn) endif() cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) @@ -86,7 +100,7 @@ cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framewor cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) if (WITH_MKLDNN) - cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) - cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) - cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) + cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) + cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) endif () diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 2ee12cc410393d1e1aa5fc9e5374d858eca1b901..929d9edc34ffb92f468d5b7af54a0b8da4121543 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" +#include #include namespace paddle { @@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector &source) { } std::unordered_set visited; - std::unordered_set to_visit{source.begin(), source.end()}; + std::set to_visit{source.begin(), source.end()}; std::vector inlink_visited; while (!to_visit.empty()) { diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc similarity index 98% rename from paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index d4a701e0b173a96d8605dff308fee7007a0ecc0c..5d0b294f6fec5f14dcddb91f8ceffb27fc833d4e 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" #include #include #include diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc similarity index 99% rename from paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index a8029e67e659a269f8492cf6e2f1f09040144283..fb3db81347b102cfa264082b36a2e22ea8c22982 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include #include #include diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc similarity index 98% rename from paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc rename to paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc index 61ba097fd8cb55e25bda1947ea97d53308c55bd3..9ef5c298b8cddfec094e9544dc6da9afdcaf0dab 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -15,8 +15,8 @@ #include #include -#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc index e359a3832ee8d549f8c58d63bc1cc6564ecadede..4f4605398a665e63662a64a3a925c32d48f10952 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h" #include #include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc similarity index 98% rename from paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc index 19248b4dfee1da81d18cd2effac08ba68dde80fb..06d56f6222e4bb9a9969d4ab2d260c97d1ce6c72 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h" #include #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc similarity index 96% rename from paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc rename to paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index 19056e18aa892dbc83dfbf7305b6ad8b6b6bc51c..7851e8c84bca2e3b05d3b1603eaa4c0ca5909e10 100644 --- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h similarity index 100% rename from paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h rename to paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc similarity index 98% rename from paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc rename to paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc index 09d0b15f46a7e50afb6aea46383013ce6a6c6118..1783e3322b1df8125f580f09a12aefe64d246c1a 100644 --- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" #include diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc similarity index 95% rename from paddle/fluid/framework/ir/mkldnn_placement_pass.cc rename to paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc index 951fcb066ce759ebfec0182e1e9dca887e343170..20e52410ffe3caa86450bc05bf3aabf5a5bce374 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" #include namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn_placement_pass.h rename to paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 8fbbc6584e121d22bdec8173d501a35dc97c9c06..f46bdf96ba1e9e1e137c690057051d9a127d45c9 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { if (!platform::is_cpu_place(t.place())) { - LoDTensor tt; - framework::TensorCopy(t, platform::CPUPlace(), &tt); + LoDTensor cpu_tensor; + cpu_tensor.set_lod(t.lod()); + framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(t.place()); dev_ctx.Wait(); - os << tt; + os << cpu_tensor; return os; } diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index c3a044d22cf04dceecc164fae934ee15c4563af1..5d854cb8d7856a631faf01741d29d3cecfd9a627 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc deleted file mode 100644 index 7e174c7def1ffa4089a94d9cc504b18843557c53..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ngraph_operator.cc +++ /dev/null @@ -1,545 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include -#include - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/framework/var_type.h" - -#include "ngraph/ngraph.hpp" - -namespace paddle { -namespace framework { - -static ngraph::Shape Ddim2Shape(const DDim& dims) { - ngraph::Shape sp; - for (int i = 0; i < dims.size(); ++i) { - int k = dims[i]; - k = k == 0 ? 1 : k; - sp.push_back(k); - } - return sp; -} - -static std::map pd2ng_type_map = { - {proto::VarType::FP32, ngraph::element::f32}, - {proto::VarType::FP64, ngraph::element::f64}, - {proto::VarType::INT32, ngraph::element::i32}, - {proto::VarType::INT64, ngraph::element::i64}, - {proto::VarType::BOOL, ngraph::element::boolean}, -}; - -typedef enum { /* nGraph support state on ops */ - FULL_TRAIN, /* Support full ops for train */ - PARTIAL_TRAIN, /* Support partial ops for train */ - FULL_TEST, /* Support full list of ops for test */ - PARTIAL_TEST /* Support partial list of ops for test */ -} op_state; - -// perform graph build through bridge and execute computation -class NgraphEngine { - public: - explicit NgraphEngine(const Scope& scope, const platform::Place& place, - const std::vector>& ops, - const std::unordered_map< - std::string, ngraph::element::Type>& var_type_map, - const std::unordered_set& persist, - const std::unordered_set& fetches, - const std::unordered_set& post_op_inputs, - op_state ng_op_state) - : scope_(scope), - place_(place), - fused_ops_(ops), - var_type_map_(var_type_map), - persistables_(persist), - fetches_(fetches), - post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) { - var_in_node_map_ = std::make_shared< - std::unordered_map>>(); - - var_node_map_ = std::make_shared< - std::unordered_map>>(); - - BuildNgIO(); - - GetNgFunction(); - } - - void Run(const Scope& scope, const platform::Place& place) const; - - private: - static std::unordered_map> - func_cache_; - const Scope& scope_; - const platform::Place& place_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - op_state ng_op_state_; - - // ngraph backend eg. 
CPU - static std::shared_ptr backend_; - // ngraph function to call and execute - std::shared_ptr ngraph_function_; - // var_name of inputs - std::vector var_in_; - // var_name of outputs from fetch in order - std::vector var_out_; - // map input vars to nodes - std::shared_ptr< - std::unordered_map>> - var_in_node_map_; - // map each var name with a ngraph node - std::shared_ptr< - std::unordered_map>> - var_node_map_; - // cache key to check if function is cached - std::shared_ptr GetCacheKey(); - // get ngraph input and define ngraph input parameters - void GetNgInputShape(std::shared_ptr op); - // Call ngraph bridge to map ops - void BuildNgNodes(); - // get the ngraph input and output var list - void BuildNgIO(); - // build ngraph function call - void BuildNgFunction(); - // Check cache for ngraph function or otherwise build the function - void GetNgFunction(); -}; - -std::vector>::iterator>> -NgraphOperator::NgraphOpIntervals( - std::vector>* ops) { - std::vector>::iterator>> - intervals; - if (ops->empty()) { - return intervals; - } - size_t size = ops->size(); - size_t left = 0; - while (left < size && ops->at(left)->Type() != kFeedOpType) { - ++left; - } - if (left == size) { - return intervals; - } - while (left < size && ops->at(left)->Type() == kFeedOpType) { - ++left; - } - - size_t right = left; - while (right < size && ops->at(right)->Type() != kFetchOpType) { - ++right; - } - if (right == size) { - return intervals; - } - if (left >= right) return intervals; - - // (left, right - 1) represents indices between feed and fetch - size_t pivot = left; - while (pivot < right) { - auto op_type = ops->at(pivot)->Type(); - if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == - paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { - ++pivot; - } else { - size_t start = pivot, end = start; - while (pivot < right && - (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops->at(pivot)->Type()) != - paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { - ++pivot; - ++end; - } - std::vector>::iterator> - interval = {ops->begin() + start, ops->begin() + end}; - intervals.push_back(interval); - } - } // end while - - return intervals; -} - -NgraphOperator::NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), - pdesc_(prog), - block_(block_id) { - for (std::vector>::iterator it = start; - it != end; ++it) { - fused_ops_.push_back(std::move(*it)); - } - - for (std::vector>::iterator it = end; - (*it)->Type() != kFetchOpType; ++it) { - for (auto& var_name_item : (*it)->Inputs()) { - for (auto& var_name : var_name_item.second) { - post_op_inputs_.insert(var_name); - } - } - } - - if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_full_ = true; - } - - Process(); -} - -void NgraphOperator::Process() { - auto& bdesc = pdesc_.Block(block_); - for (auto& var : bdesc.AllVars()) { - if (!(var->GetType() == proto::VarType::SELECTED_ROWS || - var->GetType() == proto::VarType::LOD_TENSOR || - var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { - continue; - } - - auto var_name = var->Name(); - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var_name != "fetch" && var_name != "feed") { - auto pd_type = var->GetDataType(); - if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { - 
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", - var_name); - } - var_type_map_[var_name] = pd2ng_type_map[pd_type]; - } - - if (var->Persistable()) { - persistables_.insert(var->Name()); - } - } - - for (auto* op : bdesc.AllOps()) { - if (op->Type() == kFetchOpType) { - std::string fetch_target_name = op->Input("X")[0]; - fetches_.insert(fetch_target_name); - } - } -} - -void NgraphOperator::RunImpl(const Scope& scope, - const platform::Place& place) const { - op_state ng_op_state = PARTIAL_TEST; - auto& bdesc = pdesc_.Block(block_); - for (auto* op : bdesc.AllOps()) { - if (op->Type().find("_grad") != std::string::npos) { - ng_op_state = PARTIAL_TRAIN; - break; - } - } - - if (is_full_) { - ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; - } - - NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, - persistables_, fetches_, post_op_inputs_, - ng_op_state); - ngraph_engine.Run(scope, place); -} - -std::unordered_map> - NgraphEngine::func_cache_ = {}; - -std::shared_ptr NgraphEngine::backend_ = - ngraph::runtime::Backend::create("CPU"); - -void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); - op->RuntimeInferShape(scope_, place_, ctx); - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto sp = Ddim2Shape(tensor_pd->dims()); - if (std::find(var_in_.begin(), var_in_.end(), var_name) != - var_in_.end()) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, sp, true); - (*var_node_map_)[var_name] = prm; - (*var_in_node_map_)[var_name] = prm; - } - } - } - } - } -} - -void NgraphEngine::BuildNgNodes() { - for (auto& var_name : var_out_) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - auto ng_shape = Ddim2Shape(ddim); - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, ng_shape, true); - (*var_node_map_)[var_name] = prm; - } - } - } - - paddle::framework::NgraphBridge ngb(var_node_map_); - for (auto& op : fused_ops_) { - ngb.BuildNgNode(op); - } -} - -void NgraphEngine::BuildNgIO() { - std::unordered_set inputs; - std::unordered_set outputs; - - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - inputs.insert(var_name); - const bool is_output = outputs.find(var_name) != outputs.end(); - if (!is_output && - std::find(var_in_.begin(), var_in_.end(), var_name) == - var_in_.end()) { - // fill var_in here to keep lhs and rhs order - var_in_.push_back(var_name); - } - } - } - - if (op->Type() != "fill_constant") { - GetNgInputShape(op); - } - - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& var_name : var_name_item.second) { - outputs.insert(var_name); - } - } - } - - // var_out.clear(); - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& 
var_name : var_name_item.second) { - switch (ng_op_state_) { - case PARTIAL_TEST: - if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || - fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TEST: - if (fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case PARTIAL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - post_op_inputs_.find(var_name) != post_op_inputs_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - default: - var_out_.push_back(var_name); - } - } - } - } -} - -void NgraphEngine::BuildNgFunction() { - BuildNgNodes(); - ngraph_function_ = nullptr; - ngraph::NodeVector func_outputs; - ngraph::ParameterVector func_inputs; - - for (auto& vo : var_out_) { - func_outputs.push_back(var_node_map_->at(vo)); - } - - for (auto& vi : var_in_) { - std::shared_ptr prm = - std::dynamic_pointer_cast( - var_in_node_map_->at(vi)); - func_inputs.push_back(prm); - } - - ngraph_function_ = - std::make_shared(func_outputs, func_inputs); -} - -std::shared_ptr NgraphEngine::GetCacheKey() { - auto cache_key = std::make_shared(""); - *cache_key += std::to_string(fused_ops_.size()); - for (auto& op : fused_ops_) { - *cache_key += op->Type(); - } - for (auto& var_name : var_in_) { - auto shape = var_node_map_->at(var_name)->get_shape(); - *cache_key += var_name; - *cache_key += var_type_map_.at(var_name).c_type_string(); - for (size_t i = 0; i < shape.size(); ++i) { - *cache_key += std::to_string(shape.at(i)); - } - } - - for (auto& var_name : var_out_) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - for (int i = 0; i < ddim.size(); ++i) { - *cache_key += std::to_string(ddim[i]); - } - } - } - return cache_key; -} - -void NgraphEngine::GetNgFunction() { - bool cache_on = true; - if (cache_on) { - std::string cache_key_val = *GetCacheKey(); - if (func_cache_.find(cache_key_val) != func_cache_.end()) { - ngraph_function_ = func_cache_.at(cache_key_val); - } else { - BuildNgFunction(); - func_cache_[cache_key_val] = ngraph_function_; - } - } else { - BuildNgFunction(); - } -} - -void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { - std::vector> t_in; - std::vector> t_out; - - for (size_t i = 0; i < var_in_.size(); ++i) { - auto vi = var_in_.at(i); - auto sp = var_node_map_->at(vi)->get_shape(); - std::shared_ptr ti; - auto* var = scope.FindVar(vi); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), - "Ensure ngraph tensor layout align with paddle tensor"); - if (tensor_pd->type() == proto::VarType::FP32) { - const float* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT32) { - const int* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT64) { - const int64_t* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == 
proto::VarType::FP64) { - const double* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::BOOL) { - const bool* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::boolean, sp, - const_cast(arr)); - } else { - PADDLE_THROW("Data type not handling for var %s", vi); - } - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", vi); - } - bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) - ? true - : false; - bool is_persistable = - (persistables_.find(vi) != persistables_.end()) ? true : false; - if (is_test && is_persistable) { - ti->set_stale(false); - } - t_in.push_back(ti); - } - - for (size_t i = 0; i < var_out_.size(); ++i) { - auto var_name = var_out_[i]; - auto* var = scope.FindVar(var_name); - std::shared_ptr to; - if (var && var->IsType()) { - auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - auto dd = tensor_pd->dims(); - ngraph::Shape sp = Ddim2Shape(dd); - auto ng_type = var_type_map_.at(var_name); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); - } else { - PADDLE_THROW("Data type not handled in for var %s", var_name); - } - t_out.push_back(to); - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); - } - } - - backend_->call(backend_->compile(ngraph_function_), t_out, t_in); -} // NgraphEngine::RunImpl -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h deleted file mode 100644 index ede80f44bea208b66acc3b3f4bc0f4adee4fb860..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ngraph_operator.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/variant.h" - -#include "ngraph/type/element_type.hpp" - -namespace paddle { -namespace framework { - -class NgraphOperator : public OperatorBase { - public: - static std::vector< - std::vector>::iterator>> - NgraphOpIntervals( - std::vector>* ops); - - explicit NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type = "fused_op", const VariableNameMap& inputs = {}, - const VariableNameMap& outputs = {}, const AttributeMap& attrs = {}); - - void RunImpl(const Scope& scope, const platform::Place& place) const final; - - private: - const ProgramDesc pdesc_; - size_t block_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - bool is_full_ = false; - - void Process(); -}; -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 38f811c0e9e8ecc6a4226e17e621a3c2ef3f78c5..9d6c10ab9e33d0e9888fa484030be9da7752512e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -19,8 +19,6 @@ limitations under the License. */ #include #include #include -#include "gflags/gflags.h" -#include "glog/logging.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -557,18 +555,17 @@ Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { - auto names = op().Outputs(name); + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + const std::vector& vars = it->second; std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) -> Tensor* { - auto var = scope_.FindVar(sub_name); - if (var == nullptr) return nullptr; - PADDLE_ENFORCE( - var->IsType(), - "%s should be LoDTensor, but the received type is %s", - sub_name, ToTypeName(var->Type())); - return var->GetMutable(); + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> Tensor* { + return var == nullptr ? 
nullptr + : var->GetMutable<LoDTensor>(); }); return res; } @@ -1075,7 +1072,9 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - int data_type = -1; + proto::VarType::Type default_data_type = + static_cast<proto::VarType::Type>(-1); + proto::VarType::Type data_type = default_data_type; for (auto& input : this->inputs_) { const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first); for (size_t i = 0; i < vars.size(); ++i) { @@ -1092,18 +1091,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( if (t != nullptr) { PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", input.first, i); - int tmp = static_cast<int>(t->type()); + proto::VarType::Type tmp = t->type(); PADDLE_ENFORCE( - tmp == data_type || data_type == -1, + tmp == data_type || data_type == default_data_type, "DataType of Paddle Op %s must be the same. Get (%d) != (%d)", - Type(), data_type, tmp); + Type(), DataTypeToString(data_type), DataTypeToString(tmp)); data_type = tmp; } } } } - PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); - return static_cast<proto::VarType::Type>(data_type); + PADDLE_ENFORCE(data_type != default_data_type, + "DataType should be indicated by input"); + return data_type; } OpKernelType OperatorWithKernel::GetExpectedKernelType( diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index ce3ad18b1fb1c6304eaa60173e6dfad5e9dafb2d..ef5404e4755817cefc925acbf4882ff86d1f0ba3 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -25,7 +25,8 @@ inline const T* Tensor::data() const { check_memory_size(); bool valid = std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType; - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", + DataTypeToString(type_)); return reinterpret_cast<const T*>( reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index a730b84a916ea2c3e17dd4becaf939cc28160457..5db422119966948f75970874e13d416ea699158a 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator) -cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index b7df4b8886d629e98225c95eae9a4f2ed9400710..47488d4dea79f285769f29c93f7888a7f783f070 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -13,6 +13,7 @@ // limitations under the License.
#include "paddle/fluid/imperative/layer.h" + #include #include #include @@ -22,6 +23,9 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -34,22 +38,66 @@ std::map py_funcs_; using framework::Variable; -void AddTo(Variable* src, Variable* dst) { - framework::LoDTensor* dst_tensor = dst->GetMutable(); - framework::LoDTensor* src_tensor = src->GetMutable(); +namespace detail { + +template +class TensorAddToFunctor : public boost::static_visitor<> { + public: + TensorAddToFunctor(int64_t numel, const T* x, T* y) + : numel_(numel), x_(x), y_(y) {} + + void operator()(const platform::CPUPlace& place) { + platform::CPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } + +#ifdef PADDLE_WITH_CUDA + void operator()(const platform::CUDAPlace& place) { + platform::CUDADeviceContext* ctx = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } +#else + void operator()(const platform::CUDAPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } +#endif + + // there is NO blas in CUDAPinnedPlace + void operator()(const platform::CUDAPinnedPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } + + private: + int64_t numel_; + const T* x_; + T* y_; +}; + +} // namespace detail + +void AddTo(Variable* src, Variable* dst, platform::Place place) { + framework::Tensor* dst_tensor = dst->GetMutable(); + framework::Tensor* src_tensor = src->GetMutable(); + // FIXME(minqiyang): loss_grad op will pass a zero grad of label // ugly fix for it if (src_tensor->numel() == 0) { return; } + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "dst_numel %lld vs. 
src_numel %lld", dst_tensor->numel(), src_tensor->numel()); - float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); - const float* src_data = src_tensor->data(); - for (int64_t i = 0; i < src_tensor->numel(); ++i) { - dst_data[i] += src_data[i]; - } + + detail::TensorAddToFunctor func( + src_tensor->numel(), src_tensor->data(), + dst_tensor->mutable_data(place)); + boost::apply_visitor(func, place); } class Autograd { @@ -108,6 +156,8 @@ class Autograd { for (auto it : candidate->pre_ops_) { for (OpBase* pre_op : it.second) { if (!pre_op) continue; + VLOG(5) << "op dep " << candidate->op_desc_->Type() << " <---- " + << it.first << " <---- " << pre_op->op_desc_->Type(); if (visited.find(pre_op) == visited.end()) { visited.insert(pre_op); queue.push_back(pre_op); @@ -120,66 +170,104 @@ class Autograd { } }; +std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, + const bool blocking) const { + PADDLE_ENFORCE(var_->IsInitialized(), + "Variable must be initialized when getting numpy tensor"); + + std::unique_ptr new_var(new VarBase()); + framework::LoDTensor* tensor = + new_var->var_->GetMutable(); + tensor->Resize(var_->Get().dims()); + tensor->set_lod(var_->Get().lod()); + + if (blocking) { + platform::DeviceContext* dev_ctx = + platform::DeviceContextPool::Instance().Get(dst_place); + + framework::TensorCopySync(var_->Get(), dst_place, + tensor); + + dev_ctx->Wait(); + } else { + framework::TensorCopy(var_->Get(), dst_place, tensor); + } + + if (platform::is_gpu_place(dst_place)) { + VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu"; + } + + return new_var; +} + framework::LoDTensor& VarBase::GradValue() { VLOG(3) << "get var grad " << var_desc_->Name(); return *(grads_->var_->GetMutable()); } std::map> OpBase::ApplyGrad() { - if (!grad_op_desc_ && backward_id_ <= 0) { + if (grad_op_descs_.empty() && backward_id_ <= 0) { LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; } - std::map> grad_outputs; + std::vector grad_outputs; if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; - grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( - backward_id_, - grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]); + grad_outputs.resize(1); + grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + PyLayer::ApplyGrad( + backward_id_, + grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); } else { - VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - for (size_t i = 0; i < it.second.size(); ++i) { - // Allocate a new variable - Variable* tmp_var = new framework::Variable(); - tmp_var->GetMutable(); - outputs.push_back(tmp_var); + grad_outputs.resize(grad_op_descs_.size()); + for (size_t k = 0; k < grad_op_descs_.size(); ++k) { + framework::OpDesc* grad_op_desc = grad_op_descs_[k]; + VLOG(3) << "op grad " << grad_op_desc->Type(); + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + outputs.push_back(tmp_var); + } } - } - framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); - // No need to do compile time infer shape here. - // grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); + // No need to do compile time infer shape here. 
+ // grad_op_desc_->InferShape(*block_); + grad_op_desc->InferVarType(block_); - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc_); - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + framework::Scope scope; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); + p.op.RuntimeInferShape(scope, place_, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + } } - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - auto& origin_outputs = it.second; - PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); - - for (size_t i = 0; i < outputs.size(); ++i) { - framework::Variable* grad = outputs[i]; - framework::Variable* orig_grad = origin_outputs[i]; - AddTo(grad, orig_grad); - delete grad; + for (size_t k = 0; k < grad_output_vars_.size(); ++k) { + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + auto& origin_outputs = it.second; + PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); + + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* grad = outputs[i]; + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(grad, orig_grad, place_); + delete grad; + } } } + return input_vars_; } @@ -188,8 +276,10 @@ void VarBase::RunBackward() { VLOG(3) << "start backward"; auto grads_t = grads_->var_->GetMutable(); - float* data = grads_t->mutable_data(platform::CPUPlace()); - std::fill(data, data + grads_t->numel(), 1.0); + operators::math::set_constant( + *(platform::DeviceContextPool::Instance().Get( + var_->GetMutable()->place())), + grads_t, 1.0); PADDLE_ENFORCE( grads_ == diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 0b1077c640e076797ba7e0200dc8d0eb8bfcff16..78205486c5534ac0c61cc6d545bdafa4dfc95695 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -21,17 +21,22 @@ #include // NOLINT #include // NOLINT #include // NOLINT +#include // NOLINT #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace imperative { +class VarBase; + namespace py = ::pybind11; class PreparedOp { @@ -81,6 +86,8 @@ class PreparedOp { return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); } + inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } + const framework::OperatorBase& op; const framework::RuntimeContext& ctx; framework::OperatorWithKernel::OpKernelFunc func; @@ -134,20 +141,31 @@ class VarBase { void RunBackward(); void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name, - int pre_op_out_idx, bool stop_gradient) { + int pre_op_out_idx, bool pre_op_stop_gradient) { pre_op_ = pre_op; pre_op_out_name_ = pre_op_out_name; 
pre_op_out_idx_ = pre_op_out_idx; - stop_gradient_ = stop_gradient; + if (pre_op_stop_gradient) { + stop_gradient_ = pre_op_stop_gradient; + } } void ClearGradient() { - delete grads_; - grads_ = new VarBase(true); + VLOG(1) << "clear gradient of " << var_desc_->Name(); + if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) { + auto grads_t = grads_->var_->GetMutable(); + operators::math::set_constant( + *(platform::DeviceContextPool::Instance().Get( + grads_->var_->Get().place())), + grads_t, 0.0); + } } framework::LoDTensor& GradValue(); + std::unique_ptr NewVarBase(const platform::Place& dst_place, + const bool blocking) const; + inline std::string GradName() const { PADDLE_ENFORCE( var_desc_, @@ -175,11 +193,13 @@ class OpBase { OpBase() : op_desc_(nullptr), forward_id_(-1), - grad_op_desc_(nullptr), - backward_id_(-1) {} + backward_id_(-1), + place_(platform::CPUPlace()) {} virtual ~OpBase() { - if (grad_op_desc_) delete grad_op_desc_; + for (framework::OpDesc* desc : grad_op_descs_) { + delete desc; + } } std::map> ApplyGrad(); @@ -188,18 +208,25 @@ class OpBase { // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. framework::OpDesc* op_desc_; int forward_id_; - // When has backward, one of `grad_op_desc_` or `backward_id_` is set, + + // When has backward, one of `grad_op_descs_` or `backward_id_` is set, // not both. - framework::OpDesc* grad_op_desc_; + // Note: each fwd op corresponds to a vector of bwd ops. + std::vector grad_op_descs_; int backward_id_; + platform::Place place_; + VarBasePtrMap input_vars_; VarBasePtrMap output_vars_; OpBasePtrMap pre_ops_; std::map> pre_ops_out_idx_; - framework::VariableValueMap grad_input_vars_; - framework::VariableValueMap grad_output_vars_; + // Inputs to a vector of bwd ops. + std::vector grad_input_vars_; + // Outputs to a vector of bwd ops. + std::vector grad_output_vars_; + framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 843fee41f38f1247473ba06978248659495f8585..bc39d11ba00a6a7c386162a1f9201c6f992c8692 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -14,33 +14,61 @@ #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace imperative { void CreateGradOp(const framework::OpDesc& op_desc, const std::unordered_set& no_grad_set, const std::vector& grad_sub_block, - framework::OpDesc** grad_op_desc, + std::vector* grad_op_descs, std::unordered_map* grad_to_var) { - std::vector> grad_op_descs = + PADDLE_ENFORCE(grad_op_descs->empty()); + std::vector> descs = framework::OpInfoMap::Instance() .Get(op_desc.Type()) .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); - PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); - // TODO(panyx0718): Leak? 
- *grad_op_desc = grad_op_descs[0].release(); + + for (auto& desc : descs) { + grad_op_descs->emplace_back(desc.release()); + } } -void InitVar(framework::Variable* var, framework::Variable* grad_var) { +void InitVar(framework::Variable* var, framework::Variable* grad_var, + platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(dev_ctx, + "Could not get valid device from forward op"); auto& var_t = var->Get(); - float* data = - grad_var->GetMutable()->mutable_data( - var_t.dims(), platform::CPUPlace()); - std::fill(data, data + var_t.numel(), 0.0); + grad_var->GetMutable()->mutable_data( + var_t.dims(), dev_ctx->GetPlace()); + operators::math::set_constant( + *dev_ctx, grad_var->GetMutable(), 0.0); +} + +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { + platform::Place result = place; + for (auto it : inputs) { + for (VarBase* var : it.second) { + platform::Place tmp_place = + var->var_->Get().place(); + if (!platform::is_same_place(tmp_place, result)) { + PADDLE_THROW( + "Input variable should keep in the same place: %s, but get place: " + "%s of input %s instead", + result, tmp_place, it.first); + } + } + } + + return result; } void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, framework::BlockDesc* block, + const platform::Place expected_place, const bool stop_gradient) { std::map vars; @@ -57,11 +85,12 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, op->input_vars_ = inputs; for (auto it : op->input_vars_) { auto& invars = invars_map[it.first]; + invars.reserve(it.second.size()); for (VarBase* inp : it.second) { PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->op_desc_->Type(), inp->var_desc_->Name()); - invars.push_back(inp->var_); + invars.emplace_back(inp->var_); vars[inp->var_desc_->Name()] = inp; if (inp->PreOp()) { op->pre_ops_[it.first].push_back(inp->PreOp()); @@ -78,9 +107,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, for (auto it : op->output_vars_) { auto& outvars = outvars_map[it.first]; const std::vector& outputs = it.second; + outvars.reserve(outputs.size()); for (size_t i = 0; i < outputs.size(); ++i) { VarBase* out = outputs[i]; - outvars.push_back(out->var_); + outvars.emplace_back(out->var_); vars[out->var_desc_->Name()] = out; framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); @@ -105,51 +135,59 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + op->place_ = GetExpectedPlace(expected_place, inputs); + PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); + prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); + prepared_op.func(framework::ExecutionContext( + prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx)); if (!stop_gradient) { - framework::OpDesc* grad_op_desc; - // TODO(panyx): Is this leaked? 
std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get()); - op->grad_op_desc_ = grad_op_desc; - - for (auto it : grad_op_desc->Inputs()) { - auto& grad_in_vars = op->grad_input_vars_[it.first]; - for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); - auto var_it = grad_to_var->find(grad_invar); - if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); - // Forward inputs or outputs. - grad_in_vars.push_back(fwd_var_it->second->var_); - } else { - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_); + CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get()); + + op->grad_input_vars_.resize(op->grad_op_descs_.size()); + op->grad_output_vars_.resize(op->grad_op_descs_.size()); + for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) { + framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + // Forward inputs or outputs. + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->var_->IsInitialized()) { + InitVar(var->var_, var->grads_->var_, + prepared_op.GetDeviceContext()); + } + // Douts. + grad_in_vars.push_back(var->grads_->var_); } - // Douts. - grad_in_vars.push_back(var->grads_->var_); } } - } - for (auto it : grad_op_desc->Outputs()) { - auto& grad_out_vars = op->grad_output_vars_[it.first]; - for (const std::string& grad_outvar : it.second) { - block->FindRecursiveOrCreateVar(grad_outvar); - auto var_it = grad_to_var->find(grad_outvar); - PADDLE_ENFORCE(var_it != grad_to_var->end()); - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_); + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[i][it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end(), + "Could not found the grad op output var, should this " + "operator %s's stop gradient be True", + op_desc->Type()); + VarBase* var = vars[var_it->second]; + if (!var->grads_->var_->IsInitialized()) { + InitVar(var->var_, var->grads_->var_, + prepared_op.GetDeviceContext()); + } + grad_out_vars.push_back(var->grads_->var_); } - grad_out_vars.push_back(var->grads_->var_); } } } @@ -178,10 +216,12 @@ std::vector Tracer::PyTrace(OpBase* op, out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); } if (!stop_gradient) { + op->grad_input_vars_.resize(1); + op->grad_output_vars_.resize(1); auto& grad_input_vars = - op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]; + op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]; auto& grad_output_vars = - op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)]; + op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)]; for (const VarBase* inp : inputs) { grad_input_vars.push_back(inp->var_); @@ -189,16 
+229,23 @@ std::vector Tracer::PyTrace(OpBase* op, for (VarBase* out : outputs) { grad_input_vars.push_back(out->var_); } + + platform::CPUPlace place; for (VarBase* out : outputs) { grad_input_vars.push_back(out->grads_->var_); if (!grad_input_vars.back()->IsInitialized()) { - InitVar(out->var_, grad_input_vars.back()); + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now + InitVar(out->var_, grad_input_vars.back(), + platform::DeviceContextPool::Instance().Get(place)); } } + for (const VarBase* inp : inputs) { grad_output_vars.push_back(inp->grads_->var_); if (!grad_output_vars.back()->IsInitialized()) { - InitVar(inp->var_, grad_output_vars.back()); + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now + InitVar(inp->var_, grad_output_vars.back(), + platform::DeviceContextPool::Instance().Get(place)); } } } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index f225d8abe6c0635d2bdd8dba0b12c7fc3a4110db..690838215581b09ff35a0ea13f30655b77e6e187 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace imperative { @@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc, void InitVar(framework::Variable* var, framework::Variable* grad_var); +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); + class Tracer { public: explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} virtual ~Tracer() {} - void Trace(OpBase* op, - const std::map>& inputs, - const std::map>& outputs, - framework::BlockDesc* block, const bool stop_gradient = false); + void Trace(OpBase* op, const VarBasePtrMap& inputs, + const VarBasePtrMap& outputs, framework::BlockDesc* block, + const platform::Place expected_place, + const bool stop_gradient = false); std::vector PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient = false); private: + platform::Place GetPlace(const VarBasePtrMap& inputs); + framework::BlockDesc* root_block_; }; diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 88ce61f9b928aba1945bddc1f9f6b785834780ca..2f31b182af7293488719e41a92b2ea78709bda02 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/platform/variant.h" namespace paddle { @@ -130,10 +131,14 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, + AnalysisConfig::Precision); // Memory optimized related. 
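The GetExpectedPlace helper introduced in tracer.cc only succeeds when every traced input already lives on the place the forward op will run on. Below is a simplified standalone sketch of that invariant; the Place enum and CheckInputsOnPlace name are illustrative stand-ins for platform::Place and the real helper, which this snippet does not pull in.

#include <map>
#include <stdexcept>
#include <string>
#include <vector>

enum class Place { kCPU, kCUDA0 };

// Mirrors the check in GetExpectedPlace: every input slot must already sit
// on the expected place, otherwise tracing aborts with an error.
Place CheckInputsOnPlace(Place expected,
                         const std::map<std::string, std::vector<Place>>& inputs) {
  for (const auto& slot : inputs) {
    for (Place p : slot.second) {
      if (p != expected) {
        throw std::runtime_error("Input variable of slot '" + slot.first +
                                 "' is not on the expected place");
      }
    }
  }
  return expected;
}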
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); - DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool); + DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); + DECL_ARGUMENT_FIELD(static_memory_optim_force_update, + StaticMemoryOptimForceUpdate, bool); // Indicate which kind of sort algorithm is used for operators, the memory // optimization relays on the sort algorithm. DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index ca40c01fc57dbcc2ca16770a1b7d798de8b5625b..4f5c50d0d6b9ac94130cb82fb342ae5ee592f2c0 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -36,6 +36,14 @@ void SetAttr(framework::proto::OpDesc *op, const std::string &name, attr->set_i(data); } template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const bool &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(data); +} +template <> void SetAttr(framework::proto::OpDesc *op, const std::string &name, const int64_t &data) { auto *attr = op->add_attrs(); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index de04713b531dc421b885473cc8956e8ba6b63574..59107f28080dceb0a58e17d42281db5f3773de56 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include @@ -29,9 +30,14 @@ limitations under the License. */ #include "paddle/fluid/platform/port.h" #ifdef _WIN32 -#define GCC_ATTRIBUTE(attr__) ; +#include +#include +#define GCC_ATTRIBUTE(attr__) +#define MKDIR(path) _mkdir(path) #else +#include #define GCC_ATTRIBUTE(attr__) __attribute__((attr__)); +#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) #endif #define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) @@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) { return false; } +static std::string GetDirRoot(const std::string &path) { + char sep = '/'; + +#ifdef _WIN32 + sep = '\\'; +#endif + + size_t i = path.rfind(sep, path.length()); + if (i != std::string::npos) { + return (path.substr(0, i)); + } + return path; +} + +static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) { + std::string opt_cache_dir = model_root + "/_opt_cache/"; + if (!PathExists(opt_cache_dir)) { + PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1, + "Can not create optimize cache directory: %s, Make sure you " + "have permission to write", + opt_cache_dir); + } + return opt_cache_dir; +} + +static std::string GetTrtCalibPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_calib_" + engine_key; +} + +// If there is no calib table data file in model_opt_cache_dir, return "". 
+static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, + const std::string &engine_key, + bool enable_int8) { + std::string trt_calib_table_path = + GetTrtCalibPath(model_opt_cache_dir, engine_key); + if (enable_int8 && FileExists(trt_calib_table_path)) { + VLOG(3) << "Calibration table file: " << trt_calib_table_path + << "is found here"; + std::ifstream infile(trt_calib_table_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string calibration_data(buffer.str()); + return calibration_data; + } + return ""; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4e1464226450b833e6d8dae2be2dcad89dd1e5e4..fe3c841186c35ea28c1d44007d91de5b997c1388 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", new int(argument->tensorrt_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + + bool enable_int8 = argument->tensorrt_precision_mode() == + AnalysisConfig::Precision::kInt8; + + pass->Set("enable_int8", new bool(enable_int8)); + std::string model_opt_cache_dir = + argument->Has("model_dir") + ? argument->model_dir() + : GetDirRoot(argument->model_program_path()); + pass->Set( + "model_opt_cache_dir", + new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } // graph_ = pass->Apply(std::move(graph_)); @@ -91,11 +105,14 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { } framework::proto::ProgramDesc IRPassManager::AcquireProgram( - std::unique_ptr *graph, const ProgramDesc &program) const { + std::unique_ptr *graph, ProgramDesc *program) const { auto pass = framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); - ProgramDesc desc(program); + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. 
+ ProgramDesc desc; + desc.CopyFrom(*program->Proto()); pass->SetNotOwned("program", &desc); auto *the_graph = graph->release(); *graph = pass->Apply(std::unique_ptr<Graph>(the_graph)); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 983a582649706fa6eedb5aa459b5ac53b98f658b..2a595cb36b8345157b3fd26afc62aabfa98b87bc 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -29,6 +29,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -42,8 +43,8 @@ class IRPassManager final { std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph); - framework::proto::ProgramDesc AcquireProgram( - std::unique_ptr<Graph> *graph, const ProgramDesc &program) const; + framework::proto::ProgramDesc AcquireProgram(std::unique_ptr<Graph> *graph, + ProgramDesc *program) const; framework::ir::Graph &graph() const { return *graph_; } diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 5f25303cc1eaa6b563f0f8f4289b38499eb487cc..69a9caec030600332c9f11ba255e4e642bd41e96 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -67,12 +68,33 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( return graph; } +std::string GenerateEngineKey(const std::set<std::string> &engine_inputs, + const std::set<std::string> &engine_outputs) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key)); + return engine_key; +} + void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, Graph *graph) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); + framework::ProgramDesc *program_desc = + Get<framework::ProgramDesc *>("program"); + // Add new block for TensorRTEngineOP + const framework::BlockDesc &main_block = + program_desc->Block(framework::kRootBlockIndex); + // const framework::BlockDesc& main_block = program_desc->Block(0); + framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); + // A fake block desc. framework::proto::BlockDesc block_proto; framework::BlockDesc block_desc(nullptr, &block_proto); @@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, subgraph.size()); for (auto *node : subgraph) { + auto *new_block_op = new_block->AppendOp(); auto *op = block_desc.AppendOp(); + *new_block_op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto(); } - // collect inputs - std::unordered_set<std::string> input_names; - std::unordered_set<std::string> input_names_with_id; + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the engine key. + // So, we use set instead of unordered_set here to ensure that the engine key + // is unique.
+ std::set<std::string> input_names; + std::set<std::string> input_names_with_id; for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -96,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, op_desc->SetInput( "Xs", std::vector<std::string>(input_names.begin(), input_names.end())); - std::unordered_set<std::string> output_names; - std::unordered_set<std::string> output_names_with_id; + std::set<std::string> output_names; + std::set<std::string> output_names_with_id; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // to Tensor. std::vector<std::string> output_mapping; for (auto name : output_names) { - // LOG(INFO) << name << " " << output_name_map.size(); PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } @@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, *vars->Add() = *node->Var()->Proto(); } } + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); - // Set attrs + op_desc->SetBlockAttr("sub_block", new_block); SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); + // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size")); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + + auto enable_int8 = Get<bool>("enable_int8"); + auto engine_key = + GenerateEngineKey(input_names_with_id, output_names_with_id); + + std::string calibration_data = GetTrtCalibTableData( + Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8); + SetAttr(op_desc->Proto(), "calibration_data", calibration_data); + + SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + SetAttr(op_desc->Proto(), "engine_key", engine_key); } std::vector<std::string> ExtractParameters( diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 691c336ebe4b6a6cb60023859b21665b6a4756a8..9d74dc6c211e4fcb6d1e7de5369eee847f49fc78 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass) +cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index f1da37af3cc5fa55eb66a1822aefe96eda1dc4fb..6b3d80fcef0be1527062edbb37ea39cc5d95a168 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) { } std::unique_ptr
graph(argument->main_graph_ptr()); - framework::ProgramDesc desc(argument->main_program()); + + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. + framework::ProgramDesc desc; + desc.CopyFrom(*argument->main_program().Proto()); pass->SetNotOwned("program", &desc); auto thegraph = pass->Apply(std::move(graph)); thegraph.release(); // the argument still own the graph. diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 57683c0b727ef1c922e3a308db28d0af4f193602..3d1be9196fdeacd8ff852dbb595473a687352ccf 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -444,6 +444,26 @@ std::vector>> DeseralizeBatchVarShapes( return batch_shapes; } +// Replace the -1 in shape to a real number to fake the shape. +std::vector>> FakeBatchVarShapes( + const framework::ProgramDesc& program) { + std::vector>> res; + res.emplace_back(); + auto& record = res.front(); + const int fake_batch_size = 3; + for (auto* var : program.Block(0).AllVars()) { + if (var->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + auto shape = var->GetShape(); + for (auto& v : shape) { + if (v < 0) v = fake_batch_size; + } + record[var->Name()].assign(shape.begin(), shape.end()); + } + } + return res; +} + // Calculate the average dim of each tensor from the batch shape cache. std::unordered_map GetBatchAverageSize( const std::vector>>& batches) { @@ -478,6 +498,7 @@ std::vector> AnalysisBatchShapesByBatchSize( std::unordered_map var_batchsize_hashes; for (auto& batch : batches) { for (auto& ele : batch) { + PADDLE_ENFORCE(!ele.second.empty()); int batch_size = ele.second.front(); // TODO(Superjomn) might consume large memory here, use combine hash. var_batchsize_hashes[ele.first] << batch_size; @@ -538,9 +559,21 @@ std::vector> AnalysisBatchShapesBySimilarSize( std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } +std::pair GetRange( + const std::unordered_map& ave_size) { + auto res = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::min()); + for (auto& item : ave_size) { + res.first = std::min(item.second, res.first); + res.second = std::max(item.second, res.second); + } + return res; +} + void MemoryOptimizePass::RunImpl(Argument* argument) { // When force update, should not optimize memory. - if (!argument->enable_memory_optim() || argument->memory_optim_force_update()) + if (!argument->enable_memory_optim() || + argument->static_memory_optim_force_update()) return; graph_ = argument->main_graph_ptr(); @@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { argument->model_program_path_valid() ? argument->model_program_path() : ""); VLOG(3) << "Load memory cache from " << path; - if (inference::IsFileExists(path)) { - VLOG(4) << "Performing memory optimize"; - auto batches = DeseralizeBatchVarShapes(path); - auto var_batch_ave_size = GetBatchAverageSize(batches); + std::vector>> batches; + + if (argument->static_memory_optim() && inference::IsFileExists(path)) { + string::PrettyLogInfo("--- Performing static memory optimize"); + batches = DeseralizeBatchVarShapes(path); + } else { + string::PrettyLogInfo("--- Performing dynamic memory optimize"); + batches = FakeBatchVarShapes(argument->main_program()); + } + auto var_batch_ave_size = GetBatchAverageSize(batches); + + // Get min and max memory size. 
+ const auto range = GetRange(var_batch_ave_size); + const int cluster_size = std::max( + static_cast((range.second - range.first) / 100 /*cluster num*/), + 1024); + const int cluster_size1 = std::max( + static_cast((range.second - range.first) / 1000 /*cluster num*/), + 1024); - std::unordered_map tensor_nodes; - space_table_t space_table; - CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); + std::unordered_map tensor_nodes; + space_table_t space_table; + CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); - std::unordered_map reuse_table; - double max_saving_ratio = 0.; + std::unordered_map reuse_table; + double max_saving_ratio = 0.; - std::vector> strategies; + std::vector> strategies; - for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + if (argument->static_memory_optim()) { + // This strategy only make scene in static memory optimize. strategies.emplace_back([&, sort_kind] { auto clustered_vars_by_batch_size = AnalysisBatchShapesByBatchSize(batches); @@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { space_table, &reuse_table, sort_kind, &allocation); return allocation; }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024); // interval 1kb - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, + std::numeric_limits::max()); // no intervals + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024 * 1024); // interval 1MB - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + std::function* best_strategy{nullptr}; - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, - std::numeric_limits::max()); // no intervals - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + // Try all strategies to get the best result. 
+ for (auto& strategy : strategies) { + auto allocation = strategy(); + string::PrettyLogDetail("--- get strategy saving %f memory for workspace", + allocation.GetSavingRatio()); + if (allocation.GetSavingRatio() > max_saving_ratio) { + max_saving_ratio = allocation.GetSavingRatio(); + best_strategy = &strategy; } + } + if (!best_strategy) { + LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize"; + return; + } + auto memory_allocation = (*best_strategy)(); - std::function* best_strategy{nullptr}; + string::PrettyLogInfo( + "--- Saved %.2f%s memory for workspace(temporary variables)", + memory_allocation.GetSavingRatio() * 100, "%"); - // Try all strategies to get the best result. - for (auto& strategy : strategies) { - auto allocation = strategy(); - string::PrettyLogDetail("--- get strategy saving %f memory for workspace", - allocation.GetSavingRatio()); - if (allocation.GetSavingRatio() > max_saving_ratio) { - max_saving_ratio = allocation.GetSavingRatio(); - best_strategy = &strategy; - } - } - if (!best_strategy) { - LOG(ERROR) - << "This model makes poor memory optimize, skip memory optimize"; - return; - } - auto memory_allocation = (*best_strategy)(); - - string::PrettyLogH2( - "--- Saved %.2f%s memory for workspace(temporary variables)", - memory_allocation.GetSavingRatio() * 100, "%"); - string::PrettyLogDetail("--- Allocated %d MB", - memory_allocation.allocated / 1024. / 1024.); - string::PrettyLogDetail("--- Saved %d MB", - memory_allocation.saved / 1024. / 1024.); - argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, - new std::unordered_set); - auto& vars2remove = - argument->main_graph().Get>( - framework::ir::kGraphToProgramVarsToRemove); - - PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); - argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); - } + argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, + new std::unordered_set); + auto& vars2remove = + argument->main_graph().Get>( + framework::ir::kGraphToProgramVarsToRemove); + + PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); + argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); } float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index fa1ad9c8c6aeff60ec4468f41140c57be790af7f..2da565f2ae15a50a207173b10d4c350456086582 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -13,9 +13,11 @@ // limitations under the License. 
#pragma once - +#include +#include +#include #include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index f9da3004ed8306ef08144d096afa4f86133e492d..eecab238a88e90399eb70f17caa57633af4e2a69 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -22,7 +22,7 @@ namespace paddle { -PassStrategy *contrib::AnalysisConfig::pass_builder() const { +PassStrategy *AnalysisConfig::pass_builder() const { if (!pass_builder_.get()) { if (use_gpu_) { LOG(INFO) << "Create GPU IR passes"; @@ -42,27 +42,27 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const { return pass_builder_.get(); } -contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { +AnalysisConfig::AnalysisConfig(const std::string &model_dir) { model_dir_ = model_dir; Update(); } -contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, - const std::string ¶ms_file) { +AnalysisConfig::AnalysisConfig(const std::string &prog_file, + const std::string ¶ms_file) { prog_file_ = prog_file; params_file_ = params_file; Update(); } -void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, - const std::string ¶ms_file_path) { +void AnalysisConfig::SetModel(const std::string &prog_file_path, + const std::string ¶ms_file_path) { prog_file_ = prog_file_path; params_file_ = params_file_path; Update(); } -void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, - int device_id) { +void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, + int device_id) { #ifdef PADDLE_WITH_CUDA use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; @@ -74,13 +74,13 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } -void contrib::AnalysisConfig::DisableGpu() { +void AnalysisConfig::DisableGpu() { use_gpu_ = false; Update(); } -contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { +AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; // Model related. @@ -95,12 +95,14 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(enable_memory_optim_); - CP_MEMBER(memory_optim_force_update_); + CP_MEMBER(static_memory_optim_); + CP_MEMBER(static_memory_optim_force_update_); // TensorRT releated. CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); + CP_MEMBER(tensorrt_precision_mode_); // MKLDNN releated. 
CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -128,7 +130,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { Update(); } -void contrib::AnalysisConfig::EnableMKLDNN() { +void AnalysisConfig::EnableMKLDNN() { #ifdef PADDLE_WITH_MKLDNN pass_builder()->EnableMKLDNN(); use_mkldnn_ = true; @@ -140,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() { Update(); } -void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size, - int min_subgraph_size) { +void AnalysisConfig::EnableTensorRtEngine( + int workspace_size, int max_batch_size, int min_subgraph_size, + AnalysisConfig::Precision precision_mode) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -153,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; + tensorrt_precision_mode_ = precision_mode; Update(); #else @@ -162,7 +165,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, } // TODO(Superjomn) refactor this, buggy. -void contrib::AnalysisConfig::Update() { +void AnalysisConfig::Update() { auto info = SerializeInfoCache(); if (info == serialized_info_cache_) return; @@ -222,7 +225,7 @@ void contrib::AnalysisConfig::Update() { } } -std::string contrib::AnalysisConfig::SerializeInfoCache() { +std::string AnalysisConfig::SerializeInfoCache() { std::stringstream ss; ss << model_dir_; ss << prog_file_; @@ -238,7 +241,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ss << tensorrt_min_subgraph_size_; ss << enable_memory_optim_; - ss << memory_optim_force_update_; + ss << static_memory_optim_; + ss << static_memory_optim_force_update_; ss << use_mkldnn_; for (auto &item : mkldnn_enabled_op_types_) ss << item; @@ -256,14 +260,14 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { return ss.str(); } -void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( +void AnalysisConfig::SetCpuMathLibraryNumThreads( int cpu_math_library_num_threads) { cpu_math_library_num_threads_ = cpu_math_library_num_threads; Update(); } -float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { +float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #ifdef PADDLE_WITH_CUDA // Get the GPU memory details and calculate the fraction of memory for the // GPU memory pool. 
@@ -278,21 +282,23 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } -void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { +void AnalysisConfig::EnableMemoryOptim(bool static_optim, + bool force_update_static_cache) { enable_memory_optim_ = true; - memory_optim_force_update_ = force_update_cache; + static_memory_optim_ = static_optim; + static_memory_optim_force_update_ = force_update_static_cache; Update(); } -bool contrib::AnalysisConfig::enable_memory_optim() const { +bool AnalysisConfig::enable_memory_optim() const { return enable_memory_optim_; } -void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, - size_t prog_buffer_size, - const char *param_buffer, - size_t param_buffer_size) { +void AnalysisConfig::SetModelBuffer(const char *prog_buffer, + size_t prog_buffer_size, + const char *param_buffer, + size_t param_buffer_size) { prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; @@ -300,4 +306,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } +NativeConfig AnalysisConfig::ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2b0cad5faa0e31cb7546d405e05e36754915f653..14d6ba8c56dc3fe04e27bccadd5a5155547398a4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -37,13 +39,20 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" + #endif DECLARE_bool(profile); namespace paddle { -using contrib::AnalysisConfig; +using inference::Singleton; +#if PADDLE_WITH_TENSORRT +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; +#endif namespace { bool IsPersistable(const framework::VarDesc *var) { @@ -113,6 +122,15 @@ bool AnalysisPredictor::PrepareProgram( if (!program) { if (!LoadProgramDesc()) return false; + // If not cloned, the parameters should be loaded. + // If config_.ir_optim() is True, parameters is loaded in + // OptimizeInferenceProgram(), but other persistable variables + // (like RAW type var) are not created in scope. + // If config_.ir_optim() is False, parameters is loaded in LoadParameters(), + // still need to create other persistable variables. + // So in both case, create persistable variables at first. 
+ executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); + // Optimize the program, and load parameters and modify them in the // scope_. // This will change the scope_ address. @@ -120,15 +138,6 @@ bool AnalysisPredictor::PrepareProgram( status_ir_optim_enabled_ = true; OptimizeInferenceProgram(); } else { - // If the parent_scope is passed, we assert that the persistable variables - // are already created, so just create the no persistable variables. - - // If not cloned, the parameters should be loaded - // OptimizeInferenceProgram. - // So in both cases, just the local variables are needed to load, not the - // parematers. - executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); - // Load parameters LOG(INFO) << "load parameters "; LoadParameters(); @@ -298,15 +307,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { VLOG(3) << "Predictor::get_fetch"; - outputs->resize(fetchs_.size()); - for (size_t i = 0; i < fetchs_.size(); ++i) { - int idx = boost::get(fetchs_[i]->GetAttr("col")); + outputs->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + int idx = boost::get(fetches_[i]->GetAttr("col")); PADDLE_ENFORCE((size_t)idx == i); framework::LoDTensor &fetch = framework::GetFetchVariable(*scope, "fetch", idx); auto type = fetch.type(); auto output = &(outputs->at(i)); - output->name = fetchs_[idx]->Input("X")[0]; + output->name = fetches_[idx]->Input("X")[0]; if (type == framework::proto::VarType::FP32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; @@ -327,7 +336,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); - argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_); + argument_.SetStaticMemoryOptim(config_.static_memory_optim_); + argument_.SetStaticMemoryOptimForceUpdate( + config_.static_memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir().empty()) { @@ -337,6 +348,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file().empty()); + std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); + argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelParamsPath(config_.params_file()); } @@ -347,6 +360,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); } if (config_.use_mkldnn_) { @@ -361,7 +375,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } argument_.SetIrAnalysisPasses(passes); argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); - argument_.SetScopeNotOwned(const_cast(scope_.get())); + argument_.SetScopeNotOwned(scope_.get()); Analyzer().Run(&argument_); PADDLE_ENFORCE(argument_.scope_valid()); @@ -422,10 +436,10 @@ void AnalysisPredictor::PrepareFeedFetch() { feed_names_[op->Output("Out")[0]] = idx; } else if (op->Type() == "fetch") { int idx = 
boost::get(op->GetAttr("col")); - if (fetchs_.size() <= static_cast(idx)) { - fetchs_.resize(idx + 1); + if (fetches_.size() <= static_cast(idx)) { + fetches_.resize(idx + 1); } - fetchs_[idx] = op; + fetches_[idx] = op; } } } @@ -567,7 +581,67 @@ bool AnalysisPredictor::LoadParameters() { return true; } +#if PADDLE_WITH_TENSORRT +bool AnalysisPredictor::SaveTrtCalibToDisk() { + PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), + "This func can be invoked only in trt mode"); + auto &block = inference_program_->Block(0); + for (auto &op_desc : block.AllOps()) { + if (op_desc->Type() == "tensorrt_engine") { + std::string engine_name = + boost::get(op_desc->GetAttr("engine_key")); + if (!Singleton::Global().Has(engine_name)) { + LOG(ERROR) << "You should run the predictor(with trt) on the real data " + "to generate calibration info"; + return false; + } + TRTCalibratorEngine *calib_engine = + Singleton::Global().Get(engine_name); + LOG(INFO) << "Wait for calib threads done."; + calib_engine->calib_->waitAndSetDone(); + LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot " + "of time..."; + calib_engine->thr_->join(); + std::string calibration_table_data = + calib_engine->calib_->getCalibrationTableAsString(); + + if (calibration_table_data.empty()) { + LOG(ERROR) << "the calibration table is empty."; + return false; + } + + std::string model_opt_cache_dir = + argument_.Has("model_dir") + ? argument_.model_dir() + : inference::analysis::GetDirRoot(argument_.model_program_path()); + + std::string calibration_table_data_path = + inference::analysis::GetTrtCalibPath( + inference::analysis::GetOrCreateModelOptCacheDir( + model_opt_cache_dir), + engine_name); + + std::ofstream ofile(calibration_table_data_path, std::ios::out); + LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file " + << calibration_table_data_path; + ofile << calibration_table_data; + ofile.close(); + } + } + // Free all calibrator resources. 
+ Singleton::Global().DeleteALL(); + return true; +} +#endif + AnalysisPredictor::~AnalysisPredictor() { +#if PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled() && + config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 && + Singleton::Global().Has()) { + SaveTrtCalibToDisk(); + } +#endif if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); @@ -638,12 +712,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { // check if the cache exists if (!config_.enable_memory_optim()) { need = false; - } else if (config_.enable_memory_optim() && + } else if (config_.static_memory_optim_ && !inference::IsFileExists(inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()))) { need = true; - } else if (config_.enable_memory_optim() && - config_.memory_optim_force_update_) { + } else if (config_.static_memory_optim_ && + config_.static_memory_optim_force_update_) { need = true; } @@ -651,11 +725,15 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { return need; } +std::string AnalysisPredictor::GetSeriazlizedProgram() const { + return inference_program_->Proto()->SerializeAsString(); +} + template <> -std::unique_ptr CreatePaddlePredictor( - const contrib::AnalysisConfig &config) { - return CreatePaddlePredictor(config); +std::unique_ptr CreatePaddlePredictor( + const AnalysisConfig &config) { + return CreatePaddlePredictor( + config); } } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9095b6ec1af6794c19e94fc9326a48239b3ba145..014df4ee8b6d86232212736c43a9aff32ffee011 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -33,7 +33,6 @@ using inference::analysis::Argument; using inference::analysis::Analyzer; using framework::proto::ProgramDesc; using framework::NaiveExecutor; -using contrib::AnalysisConfig; /** \brief This predictor is based on the original native predictor with IR and * Analysis support. @@ -75,6 +74,8 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); + std::string GetSeriazlizedProgram() const override; + protected: // For memory optimization. bool need_collect_var_shapes_for_memory_optim(); @@ -97,6 +98,21 @@ class AnalysisPredictor : public PaddlePredictor { void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); +#if PADDLE_WITH_TENSORRT + // When we use Paddle-TRT INT8 engine, we need to generate calibration table + // data first, + // the calibration table contains the range for each op's input and output, + // this whole process can be divided into several steps: + // + // 1. Builds a 32-bit engine, runs it on the calibration set, and records a + // histogram for each + // tensor of the distribution of activation values. + // 2. Builds a calibration table from the histograms. + // + // After step 2, we need to store the calibration table on disk + bool SaveTrtCalibToDisk(); +#endif + // Some more detailed tests, they are made the friends of the predictor, so that // the all the details can be tested. 
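// A sketch of the user-side INT8 calibration flow implied by the hunks above:
// the calibration table is produced while the predictor runs on real data in
// kInt8 mode, and ~AnalysisPredictor() then calls SaveTrtCalibToDisk() to
// persist it. The model path and the batch-loading argument are assumptions.
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunTrtInt8CalibrationSketch(
    const std::vector<std::vector<paddle::PaddleTensor>> &calibration_batches) {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model_dir");  // assumed model directory
  config.EnableUseGpu(100, 0);
  config.EnableTensorRtEngine(1 << 20, 1, 3,
                              paddle::AnalysisConfig::Precision::kInt8);
  {
    auto predictor = paddle::CreatePaddlePredictor(config);
    // Step 1: run the FP32 engine over the calibration set so the calibrator
    // records activation ranges for each TRT subgraph input/output.
    for (const auto &feed : calibration_batches) {
      std::vector<paddle::PaddleTensor> outputs;
      predictor->Run(feed, &outputs);
    }
    // Step 2: when the predictor goes out of scope, its destructor waits for
    // the calibration threads and writes the table into the model's
    // optimization cache directory via SaveTrtCalibToDisk().
  }
}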
#if PADDLE_WITH_TESTING @@ -106,7 +122,7 @@ class AnalysisPredictor : public PaddlePredictor { #endif private: - contrib::AnalysisConfig config_; + AnalysisConfig config_; Argument argument_; std::unique_ptr executor_; platform::Place place_; @@ -115,7 +131,7 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; - std::vector fetchs_; + std::vector fetches_; // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 4688e93d7102109d2c7ece9ba37bc8f2d311dcf1..6d11b461082d0ed8ba08c9e280bba86737b86e71 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -24,7 +24,6 @@ DEFINE_string(dirname, "", "dirname to tests."); namespace paddle { -using contrib::AnalysisConfig; TEST(AnalysisPredictor, analysis_off) { AnalysisConfig config; @@ -215,6 +214,8 @@ TEST(AnalysisPredictor, memory_optim) { { // The first predictor help to cache the memory optimize strategy. auto predictor = CreatePaddlePredictor(config); + LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram(); + ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty()); // Run several times to check the parameters are not reused by mistake. for (int i = 0; i < 5; i++) { diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 9be059c73e20ebeeff2c4b6e8e5502e4a56fd0d6..6cd18277d63200f5bccf180a7ae3196b0ce126ff 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include +#include "paddle/fluid/framework/commit.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -97,4 +99,12 @@ void PaddleBuf::Free() { } } +std::string get_version() { + std::stringstream ss; + ss << "version: " << framework::paddle_version() << "\n"; + ss << "commit: " << framework::paddle_commit() << "\n"; + ss << "branch: " << framework::paddle_compile_branch() << "\n"; + return ss.str(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 54895679ca37362c7267677af80274b8de95e296..e82cb53bf073d3d1ab9a518218edaf430728463f 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -295,7 +295,7 @@ TEST(inference_api_native, image_classification_gpu) { #endif TEST(PassBuilder, Delete) { - contrib::AnalysisConfig config; + AnalysisConfig config; config.DisableGpu(); config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); const auto& passes = config.pass_builder()->AllPasses(); diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc index 7a579610eefda24c911edd28b5f3a178aa10ab1e..2c450ef7cead4d5c3870d5e9186eb221e5dc19a0 100644 --- a/paddle/fluid/inference/api/api_tester.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) { predictor->Run({}, &outputs); } +TEST(paddle_inference_api, get_version) { + LOG(INFO) << "paddle version:\n" << get_version(); + auto version = get_version(); + ASSERT_FALSE(version.empty()); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 338a0cec161f352781f132aea71dd56f68840c62..f7da55c9ae368763786c1b1fd3e86d942c5e9fe8 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -36,7 +36,7 @@ namespace demo { */ void Main() { std::unique_ptr predictor; - paddle::contrib::AnalysisConfig config; + paddle::AnalysisConfig config; config.EnableUseGpu(100, 0); config.SetModel(FLAGS_modeldir + "/__model__", FLAGS_modeldir + "/__params__"); diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 5320992b7e78f4aa0ea8950af03038c1953dd027..0d2c418c56db620c71d99b64ee79b18be427cc34 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -34,7 +34,6 @@ DEFINE_bool(use_gpu, false, "Whether use gpu."); namespace paddle { namespace demo { -using contrib::AnalysisConfig; /* * Use the native and analysis fluid engine to inference the demo. */ diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 1cee8904500636d7b49e6b4e54595dbce6a79954..9d9ed6a39d8324002a8850deae9bb8dd5af7ef9b 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -29,11 +29,6 @@ namespace paddle { class AnalysisPredictor; -// == -// -// ----------------------------------------------------------------------------------- -// NOTE: The following APIs are not mature yet, we are still working on them. -namespace contrib { // NOTE WIP, not stable yet. 
struct AnalysisConfig { @@ -42,6 +37,10 @@ struct AnalysisConfig { explicit AnalysisConfig(const std::string& model_dir); explicit AnalysisConfig(const std::string& prog_file, const std::string& params_file); + enum class Precision { + kFloat32 = 0, + kInt8, + }; /** Set model with a directory. */ @@ -135,7 +134,8 @@ struct AnalysisConfig { * subgraph is less than this, it will not transfer to TensorRT engine. */ void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1, int min_subgraph_size = 3); + int max_batch_size = 1, int min_subgraph_size = 3, + Precision precision = Precision::kFloat32); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -162,17 +162,7 @@ struct AnalysisConfig { /** Transform the AnalysisConfig to NativeConfig. */ - NativeConfig ToNativeConfig() const { - NativeConfig config; - config.model_dir = model_dir_; - config.prog_file = prog_file_; - config.param_file = params_file_; - config.use_gpu = use_gpu_; - config.device = device_id_; - config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); - config.specify_input_name = specify_input_name_; - return config; - } + NativeConfig ToNativeConfig() const; /** Specify the operator type list to use MKLDNN acceleration. * @param op_list the operator type list. */ @@ -195,7 +185,8 @@ struct AnalysisConfig { /** Turn on memory optimize * NOTE still in development, will release latter. */ - void EnableMemoryOptim(bool force_update_cache = false); + void EnableMemoryOptim(bool static_optim = false, + bool force_update_static_cache = false); /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; @@ -238,10 +229,12 @@ struct AnalysisConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; + Precision tensorrt_precision_mode_; // memory reuse related. bool enable_memory_optim_{false}; - bool memory_optim_force_update_{false}; + bool static_memory_optim_{false}; + bool static_memory_optim_force_update_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; @@ -262,5 +255,4 @@ struct AnalysisConfig { mutable std::unique_ptr pass_builder_; }; -} // namespace contrib } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 46b510fd1ec94c59032b8f41a2ac4d6aa87dc150..8ac8bc529183edc2f8f888ca7ba14611acaadc10 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -215,6 +215,14 @@ class PaddlePredictor { */ virtual ~PaddlePredictor() = default; + /** \brief Get the serialized model program that executes in inference phase. + * Its data type is ProgramDesc, which is a protobuf message. + */ + virtual std::string GetSeriazlizedProgram() const { + assert(false); // Force raise error. + return "NotImplemented"; + } + /** The common configs for all the predictors. 
*/ struct Config { @@ -288,4 +296,6 @@ std::unique_ptr CreatePaddlePredictor(const ConfigT& config); int PaddleDtypeSize(PaddleDType dtype); +std::string get_version(); + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index d3a60d209922ebe8d31723ca25c71a952ea08bd6..391932a1ee018c45818457c55fd8f82a22ab7405 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif }); for (int i = 6; i >= 3; i--) { diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 9afeafd176c70bc03166ec7732ae5e2faf67ea54..f4977d08c4d051b8a528e122c47948c3c81d153c 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,4 @@ -nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 01d7f700da9cc67d0ebbd3d9649e3823f58a8811..c5a413221ebff6b9be114151dbb93fd23a148440 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) { // init trt engine cudaStream_t stream_; std::unique_ptr engine_; - engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_)); - engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(5, 1 << 15, stream_)); + engine_->InitNetwork(); engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3(2, 5, 5)); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index f313beb73bb0d21cab1d62859a46fcc76a373548..e83961f3d7bda03a7659f175c59105dcb60708e9 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -78,11 +78,9 @@ class TRTConvertValidation { scope_(scope), if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { - // create engine. 
- engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); - engine_->InitNetwork(); - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_)); + engine_->InitNetwork(); } // Declare a Variable as input with random initialization. @@ -175,7 +173,7 @@ class TRTConvertValidation { op_->Run(scope_, place); // Execute TRT. engine_->Execute(batch_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); const size_t output_space_size = 3000; @@ -184,7 +182,7 @@ class TRTConvertValidation { std::vector fluid_out; std::vector trt_out(output_space_size); engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index f739752cbc44805cb0fb3246385609cf16ba744a..10f48462cfaf8073a4f5537d654d614d36b74db4 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) { PADDLE_ENFORCE(buf.device == DeviceType::GPU); buffers.push_back(buf.buffer); } - PADDLE_ENFORCE_NOT_NULL(stream_); - infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); - cudaStreamSynchronize(*stream_); + infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr); + cudaStreamSynchronize(stream_); SetRuntimeBatch(batch_size); } TensorRTEngine::~TensorRTEngine() { - cudaStreamSynchronize(*stream_); + cudaStreamSynchronize(stream_); // clean buffer for (auto &buf : buffers_) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { @@ -70,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() { // build engine. 
infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); + if (enable_int8_) { + infer_builder_->setInt8Mode(true); + PADDLE_ENFORCE( + calibrator_ != nullptr, + "The precision mode is 'INT8', the calibrator should not be nullptr"); + infer_builder_->setInt8Calibrator(calibrator_); + } infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); @@ -173,7 +179,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToDevice, *stream_), + cudaMemcpyDeviceToDevice, stream_), 0); } @@ -194,7 +200,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToHost, *stream_)); + cudaMemcpyDeviceToHost, stream_)); } Buffer &TensorRTEngine::buffer(const std::string &name) { @@ -211,12 +217,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); buf.size = size; PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyHostToDevice, *stream_)); + cudaMemcpyHostToDevice, stream_)); } void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, @@ -227,7 +232,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyDeviceToDevice, *stream_)); + cudaMemcpyDeviceToDevice, stream_)); } void TensorRTEngine::SetITensor(const std::string &name, diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index f5b2c28ba9e6fefc1d6c14640d696c3bf3ac8249..cdfe09b5a7fd2d1f8548dab9421f671f5a345153 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,12 +23,14 @@ limitations under the License. */ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { namespace tensorrt { +class TRTInt8Calibrator; /* * TensorRT Engine. * @@ -54,17 +56,17 @@ class TensorRTEngine : public EngineBase { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, - cudaStream_t* stream = nullptr, int device = 0, + TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, + int device = 0, bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream ? 
stream : &default_stream_), - logger_(logger), - device_(device) { - freshDeviceId(); - cudaStreamCreate(stream_); - } + stream_(stream), + device_(device), + enable_int8_(enable_int8), + calibrator_(calibrator), + logger_(logger) {} virtual ~TensorRTEngine(); @@ -102,7 +104,7 @@ class TensorRTEngine : public EngineBase { // NOTE this should be used after calling `FreezeNetwork`. Buffer& buffer(const std::string& name) override; - cudaStream_t* stream() { return stream_; } + cudaStream_t stream() { return stream_; } // Fill an input from CPU memory with name and size. void SetInputFromCPU(const std::string& name, const void* data, size_t size); @@ -142,8 +144,8 @@ class TensorRTEngine : public EngineBase { // In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv - // into - // one conv, and then trigger bug. So, We should use strategy to avoid this + // into one conv, and then trigger bug. So, We should use strategy to avoid + // this // optimization for the time being. This bug will be fixed in the future. std::unordered_map itensor_quote_num; @@ -156,11 +158,15 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; + cudaStream_t stream_; + // The specific GPU id that the TensorRTEngine bounded to. + int device_; + + bool enable_int8_; + TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. int batch_size_{-1}; - cudaStream_t* stream_; - // If stream_ is not set from outside, hold its own stream. - cudaStream_t default_stream_; + nvinfer1::ILogger& logger_; std::vector buffers_; @@ -169,8 +175,6 @@ class TensorRTEngine : public EngineBase { std::unordered_map itensor_map_; - // The specific GPU id that the TensorRTEngine bounded to. - int device_; std::vector> owned_plugin_; // TensorRT related internal members @@ -208,38 +212,6 @@ class TensorRTEngine : public EngineBase { #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ engine__->network()->add##layer__(ARGS); -/* - * Helper to control the TensorRT engine's creation and deletion. - */ -class TRT_EngineManager { - public: - bool HasEngine(const std::string& name) const { - return engines_.count(name) != 0; - } - - // Get an engine called `name`. 
- TensorRTEngine* Get(const std::string& name) const { - return engines_.at(name).get(); - } - - // Create or get an engine called `name` - TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, - const std::string& name, int gpu_device = 0) { - auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device); - engines_[name].reset(p); - return p; - } - - void DeleteALl() { - for (auto& item : engines_) { - item.second.reset(nullptr); - } - } - - private: - std::unordered_map> engines_; -}; - } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index da1f6535cb3b2476cd475797861d6d2bb6d88856..9eed0f6ee9ce4d9e35bec718dc8e8435921dbd81 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,8 +27,8 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - // ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(10, 1 << 10, &stream_); + ASSERT_EQ(0, cudaStreamCreate(&stream_)); + engine_ = new TensorRTEngine(10, 1 << 10, stream_); engine_->InitNetwork(); } diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a85c8b8fe6d70052edd3be59f98582c9b2e86b9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#include "glog/logging.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// set the batch size before constructing the thread to execute engine +int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } + +TRTInt8Calibrator::TRTInt8Calibrator( + const std::unordered_map& buffers, int batch_size, + std::string engine_name, const platform::Place place) + : batch_size_(batch_size), engine_name_(engine_name) { + int i = 0; + VLOG(4) << "Init a new calibrator: " << engine_name_; + for (const auto it : buffers) { + framework::Tensor temp_tensor; + std::string input_name = it.first; + int data_size = it.second; + int num_ele = data_size / sizeof(int16_t); + framework::DDim data_shape = framework::make_ddim({num_ele}); + temp_tensor.Resize(data_shape); + data_tensors_.push_back(temp_tensor); + data_buffers_[input_name] = std::pair( + static_cast(temp_tensor.mutable_data(place)), num_ele); + i += 1; + } +} + +TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data) + : batch_size_(0), + calib_running_(false), + data_is_set_(false), + done_(true), + calibration_table_(calib_data) {} + +void TRTInt8Calibrator::waitAndSetDone() { + std::unique_lock lk(mut_); + while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk); + if (!done_) { + done_ = true; + cond_.notify_all(); + } +} + +// There might be more than one input for trt subgraph, +// So, we use a map to store input information. +bool TRTInt8Calibrator::setBatch( + const std::unordered_map& data) { + VLOG(3) << "set batch: " << engine_name_; + std::unique_lock lk(mut_); + // There is a producer and a consumer. The producer set the batch data and + // the consumer get the batch data. The size of the data pool is one. + // So, the producer has to wait for the consumer to finish processing before + // they can set the data. + while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk); + // The done_ is set to true using waitAndSetDone, When all calibration data + // are processed. + if (done_) return false; + + // Sets the batch. + for (const auto& it : data) { + auto dataptr = data_buffers_.find(it.first); + if (dataptr == data_buffers_.end()) { + LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first + << "' does not match with the buffer names"; + } + const auto& d = dataptr->second; + PADDLE_ENFORCE( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), + "Fail to cudaMemcpy %s for %s", engine_name_, it.first); + } + + data_is_set_ = true; + cond_.notify_all(); + return true; +} + +bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, + int num_bindings) { + VLOG(4) << "get batch: " << engine_name_; + std::unique_lock lk(mut_); + // The consumer has just finished processing a data. + // The producer can set the data again. + calib_running_ = false; + cond_.notify_all(); + + // As long as there is data in the pool, the consumer can get it. 
+ while (!data_is_set_ && !done_) cond_.wait(lk); + if (done_) return false; + + // Gets the batch + for (int i = 0; i < num_bindings; i++) { + auto it = data_buffers_.find(names[i]); + if (it == data_buffers_.end()) { + LOG(FATAL) << "Calibration engine asked for unknown tensor name '" + << names[i] << "' at position " << i; + } + bindings[i] = it->second.first; + } + + data_is_set_ = false; + calib_running_ = true; + VLOG(4) << "get batch done: " << engine_name_; + return true; +} + +void TRTInt8Calibrator::setDone() { + std::unique_lock lk(mut_); + done_ = true; + cond_.notify_all(); +} + +const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) { + if (calibration_table_.empty()) return nullptr; + length = calibration_table_.size(); + return calibration_table_.data(); +} + +void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, + std::size_t length) { + calibration_table_ = std::string((const char*)ptr, length); + VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr + << " length=" << length; +} +TRTInt8Calibrator::~TRTInt8Calibrator() { + VLOG(4) << "Destroying calibrator for " << engine_name_; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h new file mode 100644 index 0000000000000000000000000000000000000000..5815bc9a1464293e0a56f05e34183580eac96cea --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TensorRTEngine; + +struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { + public: + TRTInt8Calibrator(const std::unordered_map& buffers, + int batch_size, std::string engine_name, + const platform::Place place); + + explicit TRTInt8Calibrator(const std::string& calibration_data); + ~TRTInt8Calibrator(); + + int getBatchSize() const override; + + bool getBatch(void* bindings[], const char* names[], + int num_bindings) override; + + bool setBatch(const std::unordered_map& data); + void setDone(); + void waitAndSetDone(); + + const void* readCalibrationCache(std::size_t& length) override; + void writeCalibrationCache(const void* ptr, std::size_t length) override; + const std::string& getCalibrationTableAsString() { + return calibration_table_; + } + + private: + const int batch_size_; + + bool calib_running_{true}; + bool data_is_set_{false}; + bool done_{false}; + + std::mutex mut_; + std::condition_variable cond_; + + std::unordered_map> data_buffers_; + std::vector data_tensors_; + + std::string engine_name_; + std::string calibration_table_; +}; + +class TRTCalibratorEngine { + public: + TRTCalibratorEngine() {} + std::unique_ptr calib_; + std::unique_ptr thr_; + std::unique_ptr engine_; +}; +/* + * Manager to control the TensorRT Int8 calibration creation and deltetion. + */ +class TRTCalibratorEngineManager { + public: + bool Has() const { return res_.size() > 0; } + bool Has(const std::string& name) const { + if (res_.count(name) == 0) return false; + return res_.at(name).get() != nullptr; + } + + // Get Int8Calibrator via name + TRTCalibratorEngine* Get(const std::string& name) const { + return res_.at(name).get(); + } + + // Look up or create a calibrator. 
+ TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) { + if (res_.count(engine_name) == 0) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + } + return res_.at(engine_name).get(); + } + + // Create an Int8Calibrator + TRTCalibratorEngine* Create(const std::string& engine_name) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto& item : res_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> res_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 423c39813f05af0d6aaade184914e6777c9b8a83..7ecd9e35332843e3a391cdad5ce32220d890abd1 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -54,6 +54,7 @@ else() message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1") endif() + # RNN2 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") @@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) endif() inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) +# googlenet +inference_analysis_api_test_with_fake_data(test_analyzer_googlenet + "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL) + # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) @@ -123,6 +128,11 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) +# bert, max_len=20, embedding_dim=128 +set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") +download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") +inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc SERIAL) + # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # anakin rnn1 diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f646fd6d91c81b6738e4fc5278739307fa5f99b5 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -0,0 +1,223 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +template +constexpr paddle::PaddleDType GetPaddleDType(); + +template <> +constexpr paddle::PaddleDType GetPaddleDType() { + return paddle::PaddleDType::INT64; +} + +template <> +constexpr paddle::PaddleDType GetPaddleDType() { + return paddle::PaddleDType::FLOAT32; +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + if (fields.size() < 5) return false; + + tensors->clear(); + tensors->reserve(5); + + int i = 0; + // src_id + paddle::PaddleTensor src_id; + ParseTensor(fields[i++], &src_id); + tensors->push_back(src_id); + + // pos_id + paddle::PaddleTensor pos_id; + ParseTensor(fields[i++], &pos_id); + tensors->push_back(pos_id); + + // segment_id + paddle::PaddleTensor segment_id; + ParseTensor(fields[i++], &segment_id); + tensors->push_back(segment_id); + + // self_attention_bias + paddle::PaddleTensor self_attention_bias; + ParseTensor(fields[i++], &self_attention_bias); + tensors->push_back(self_attention_bias); + + // next_segment_index + paddle::PaddleTensor next_segment_index; + ParseTensor(fields[i++], &next_segment_index); + tensors->push_back(next_segment_index); + + return true; +} + +bool LoadInputData(std::vector> *inputs) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + + return true; +} + +void SetConfig(AnalysisConfig *config) { config->SetModel(FLAGS_infer_model); } + +void profile(bool use_mkldnn = false) { + AnalysisConfig config; + SetConfig(&config); + + if (use_mkldnn) { + config.EnableMKLDNN(); + } + + std::vector outputs; + std::vector> inputs; + LoadInputData(&inputs); + TestPrediction(reinterpret_cast(&config), + inputs, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_bert, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_bert, profile_mkldnn) { profile(true); } +#endif + +// Check the fuse status +TEST(Analyzer_bert, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + + std::vector> inputs; + LoadInputData(&inputs); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), inputs); +} + +TEST(Analyzer_bert, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_bert, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +// Compare Deterministic result +TEST(Analyzer_bert, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> inputs; + LoadInputData(&inputs); + CompareDeterministic(reinterpret_cast(&cfg), + inputs); +} +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 4ec9404ab42bcd9cc0608f033cb2777106a29583..735e4fb563788438ee49ff6308d11f4dbe4962be 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -19,7 +19,6 @@ DEFINE_int32(max_turn_num, 9, namespace paddle { namespace inference { -using contrib::AnalysisConfig; constexpr int32_t kMaxTurnLen = 50; @@ -165,7 +164,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, input_slots->push_back(std::move(response_mask_tensor)); } -void SetConfig(contrib::AnalysisConfig *cfg) { +void SetConfig(AnalysisConfig *cfg) { cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(true); @@ -187,7 +186,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. void profile(bool use_mkldnn = false) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); if (use_mkldnn) { @@ -223,7 +222,7 @@ TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); } // Check the fuse status TEST(Analyzer_dam, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -253,17 +252,17 @@ void compare(bool use_mkldnn = false) { } // Compare result of NativeConfig and AnalysisConfig with memory optimization. -TEST(Analyzer_dam, compare_with_memory_optim) { +TEST(Analyzer_dam, compare_with_static_memory_optim) { // The small dam will core in CI, but works in local. 
if (FLAGS_max_turn_num == 9) { - contrib::AnalysisConfig cfg, cfg1; + AnalysisConfig cfg, cfg1; DataRecord data(FLAGS_infer_data, FLAGS_batch_size); std::vector> input_slots_all; SetInput(&input_slots_all); // Run the first time to force to update memory cache SetConfig(&cfg); - cfg.EnableMemoryOptim(true); + cfg.EnableMemoryOptim(true, true /*force update*/); CompareNativeAndAnalysis( reinterpret_cast(&cfg), @@ -271,7 +270,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { // Run second time to use the memory cache and perform memory optimization. SetConfig(&cfg1); - cfg1.EnableMemoryOptim(); + cfg1.EnableMemoryOptim(true, false /*do not force update*/); CompareNativeAndAnalysis( reinterpret_cast(&cfg1), @@ -279,6 +278,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) { } } +TEST(Analyzer_dam, compare_with_dynamic_memory_optim) { + // The small dam will core in CI, but works in local. + if (FLAGS_max_turn_num == 9) { + AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + // Run the first time to force to update memory cache + SetConfig(&cfg); + cfg.EnableMemoryOptim(); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), + input_slots_all); + } +} + TEST(Analyzer_dam, compare) { compare(); } #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index b9666e01adb23e0cbd9257bc55081c3a5001e887..347672eaae314aa42096d48a3b044014f2ddbf84 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -18,8 +18,6 @@ namespace paddle { namespace inference { namespace analysis { -using contrib::AnalysisConfig; - struct DataRecord { std::vector data; std::vector lod; diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 529a0174c8542f5226e70ef4a47bde069220ecc2..089f655c180d784af66af60277bdbf32a6019599 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -16,7 +16,6 @@ namespace paddle { namespace inference { -using contrib::AnalysisConfig; struct DataRecord { std::vector> query, title; @@ -75,7 +74,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { +void SetConfig(AnalysisConfig *cfg) { cfg->SetModel(FLAGS_infer_model); cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); @@ -95,7 +94,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. 
void profile(bool use_mkldnn = false) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; @@ -130,7 +129,7 @@ TEST(Analyzer_MM_DNN, profile_mkldnn) { profile(true /* use_mkldnn */); } // Check the fuse status TEST(Analyzer_MM_DNN, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -141,7 +140,7 @@ TEST(Analyzer_MM_DNN, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig void compare(bool use_mkldnn = false) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); if (use_mkldnn) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 6fef79dc4608acd6eee679ad4939e7684db98f5b..a70aa7a6ac41121a0c8ea397ebc7e24e4b206d12 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -16,7 +16,6 @@ namespace paddle { namespace inference { -using contrib::AnalysisConfig; struct DataRecord { std::vector> word, mention; @@ -76,7 +75,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data) { } } -void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { +void SetConfig(AnalysisConfig *cfg, bool memory_load = false) { if (memory_load) { std::string buffer_prog, buffer_param; ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog); @@ -105,7 +104,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. void profile(bool memory_load = false) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg, memory_load); std::vector outputs; @@ -136,7 +135,7 @@ TEST(Analyzer_Chinese_ner, profile_memory_load) { // Check the fuse status TEST(Analyzer_Chinese_ner, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -152,7 +151,7 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_Chinese_ner, compare) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index ad2c46e48d5a34a457a615f313f1ac3cc916b200..3f6c933f2bcc6ed5410cb95a48f5ee6869280fe4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -16,7 +16,6 @@ namespace paddle { namespace inference { -using contrib::AnalysisConfig; struct DataRecord { std::vector> query_basic, query_phrase, title_basic, @@ -103,7 +102,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { +void SetConfig(AnalysisConfig *cfg) { cfg->SetModel(FLAGS_infer_model); cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); @@ -123,7 +122,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. 
TEST(Analyzer_Pyramid_DNN, profile) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; @@ -147,7 +146,7 @@ TEST(Analyzer_Pyramid_DNN, profile) { // Check the fuse status TEST(Analyzer_Pyramid_DNN, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -158,7 +157,7 @@ TEST(Analyzer_Pyramid_DNN, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_Pyramid_DNN, compare) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 22e6366fb5cba6c7a0cde9c0c5f50f56c2e23b05..c27c39f40a2067dd2bd2150e4b1e53eab7cdf06e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -20,7 +20,6 @@ namespace paddle { namespace inference { using namespace framework; // NOLINT -using namespace contrib; // NOLINT struct DataRecord { std::vector>> link_step_data_all; @@ -223,7 +222,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); cfg.DisableGpu(); cfg.SwitchIrDebug(); @@ -237,7 +236,7 @@ TEST(Analyzer_rnn1, profile) { // Check the fuse status TEST(Analyzer_rnn1, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -254,7 +253,7 @@ TEST(Analyzer_rnn1, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_rnn1, compare) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; @@ -276,7 +275,7 @@ TEST(Analyzer_rnn1, compare_determine) { // Test Multi-Thread. TEST(Analyzer_rnn1, multi_thread) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index f3e75ffbb5962885bd926af50b764bec561cc454..ca04c1365cbbffcb4a2786cde9ab240cc20aa3d8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace inference { namespace analysis { -using contrib::AnalysisConfig; struct Record { std::vector data; diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index ecc10bafd650e52dfb73e8dd4329c697ff4f4ccc..b0c23fbd534847c8aad244749761e9c072148796 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -58,9 +58,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { return os; } -std::ostream &operator<<(std::ostream &os, - const contrib::AnalysisConfig &config) { - os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; +std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { + os << GenSpaces(num_spaces) << "AnalysisConfig {\n"; num_spaces++; os << config.ToNativeConfig(); if (!config.model_from_memory()) { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d2ca1d0b0098fd377725cc0fdf002d8d2e3539ec..2811eb4946ea025cf6c7ab197c4e603df86f6f2d 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -58,7 +58,7 @@ namespace inference { void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { const auto *analysis_config = - reinterpret_cast(config); + reinterpret_cast(config); if (use_analysis) { LOG(INFO) << *analysis_config; return; @@ -102,9 +102,9 @@ void CompareResult(const std::vector &outputs, std::unique_ptr CreateTestPredictor( const PaddlePredictor::Config *config, bool use_analysis = true) { const auto *analysis_config = - reinterpret_cast(config); + reinterpret_cast(config); if (use_analysis) { - return CreatePaddlePredictor(*analysis_config); + return CreatePaddlePredictor(*analysis_config); } auto native_config = analysis_config->ToNativeConfig(); return CreatePaddlePredictor(native_config); @@ -139,7 +139,8 @@ void SetFakeImageInput(std::vector> *inputs, const std::string &dirname, bool is_combined = true, std::string model_filename = "model", std::string params_filename = "params", - const std::vector *feed_names = nullptr) { + const std::vector *feed_names = nullptr, + const int continuous_inuput_index = 0) { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); std::vector> feed_target_shapes = GetFeedTargetShapes( @@ -176,7 +177,8 @@ void SetFakeImageInput(std::vector> *inputs, float *input_data = static_cast(input.data.data()); // fill input data, for profile easily, do not use random data here. 
for (size_t j = 0; j < len; ++j) { - *(input_data + j) = static_cast(j) / len; + *(input_data + j) = + static_cast((j + continuous_inuput_index) % len) / len; } } (*inputs).emplace_back(input_slots); @@ -344,6 +346,16 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +void CompareNativeAndAnalysis( + PaddlePredictor *native_pred, PaddlePredictor *analysis_pred, + const std::vector> &inputs) { + int batch_size = FLAGS_batch_size; + std::vector native_outputs, analysis_outputs; + native_pred->Run(inputs[0], &native_outputs, batch_size); + analysis_pred->Run(inputs[0], &analysis_outputs, batch_size); + CompareResult(analysis_outputs, native_outputs); +} + template std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 5aca807ee3aee1bb323abe6d5c3700dfc08e30b4..17a433c9d98768dbda4ba93bdceb6cc1717adc07 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -42,9 +42,9 @@ void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu, } template <> -void SetConfig(contrib::AnalysisConfig* config, - std::string model_dir, bool use_gpu, - bool use_tensorrt, int batch_size) { +void SetConfig(AnalysisConfig* config, std::string model_dir, + bool use_gpu, bool use_tensorrt, + int batch_size) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { config->SetModel(model_dir + "/" + FLAGS_prog_filename, model_dir + "/" + FLAGS_param_filename); @@ -75,11 +75,11 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { - contrib::AnalysisConfig config; + AnalysisConfig config; config.EnableUseGpu(100, 0); config.pass_builder()->TurnOnDebug(); - SetConfig(&config, model_dir, true, use_tensorrt, - FLAGS_batch_size); + SetConfig(&config, model_dir, true, use_tensorrt, + FLAGS_batch_size); TestPrediction(reinterpret_cast(&config), inputs_all, &outputs, FLAGS_num_threads, true); } else { @@ -99,14 +99,36 @@ void compare(std::string model_dir, bool use_tensorrt) { SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); } - contrib::AnalysisConfig analysis_config; - SetConfig(&analysis_config, model_dir, true, - use_tensorrt, FLAGS_batch_size); + AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, use_tensorrt, + FLAGS_batch_size); CompareNativeAndAnalysis( reinterpret_cast(&analysis_config), inputs_all); } +void compare_continuous_input(std::string model_dir, bool use_tensorrt) { + AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, use_tensorrt, + FLAGS_batch_size); + auto config = + reinterpret_cast(&analysis_config); + auto native_pred = CreateTestPredictor(config, false); + auto analysis_pred = CreateTestPredictor(config, true); + for (int i = 0; i < 100; i++) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename, nullptr, i); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "", nullptr, + i); + } + CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(), + inputs_all); + } +} + TEST(TensorRT_mobilenet, compare) { std::string model_dir = FLAGS_infer_model + "/mobilenet"; compare(model_dir, /* use_tensorrt 
*/ true); @@ -162,5 +184,15 @@ TEST(TensorRT_mobilenet, profile) { profile(model_dir, true, false); } +TEST(resnet50, compare_continuous_input) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, true); +} + +TEST(resnet50, compare_continuous_input_native) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, false); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 64aa63ffe9705d75e70c8d9d9cbc433dd6358596..8759ec8096cf102ab85d2c2a91eddc23a6ed0e50 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -13,8 +13,15 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/legacy_allocator.h" + #include +#include #include + +#ifdef PADDLE_WITH_JEMALLOC +#include +#endif + #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" @@ -37,7 +44,7 @@ template void *Alloc(const Place &place, size_t size); template -void Free(const Place &place, void *p); +void Free(const Place &place, void *p, size_t size); template size_t Used(const Place &place); @@ -52,6 +59,11 @@ size_t memory_usage(const platform::Place &p); using BuddyAllocator = detail::BuddyAllocator; +std::unordered_map> + gpu_mem_info; + BuddyAllocator *GetCPUBuddyAllocator() { // We tried thread_local for inference::RNN1 model, but that not works much // for multi-thread test. @@ -89,7 +101,11 @@ struct NaiveAllocator { template <> void *Alloc(const platform::CPUPlace &place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); +#ifdef PADDLE_WITH_JEMALLOC + void *p = malloc(size); +#else void *p = GetCPUBuddyAllocator()->Alloc(size); +#endif if (FLAGS_init_allocated_mem) { memset(p, 0xEF, size); } @@ -98,14 +114,24 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p) { +void Free(const platform::CPUPlace &place, void *p, + size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); +#ifdef PADDLE_WITH_JEMALLOC + free(p); +#else GetCPUBuddyAllocator()->Free(p); +#endif } template <> size_t Used(const platform::CPUPlace &place) { +#ifdef PADDLE_WITH_JEMALLOC + // fake the result of used memory when PADDLE_WITH_JEMALLOC is ON + return 0U; +#else return GetCPUBuddyAllocator()->Used(); +#endif } #ifdef PADDLE_WITH_CUDA @@ -177,9 +203,16 @@ void *Alloc(const platform::CUDAPlace &place, LOG(WARNING) << "GPU memory used: " << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); - } - if (FLAGS_init_allocated_mem) { - cudaMemset(ptr, 0xEF, size); + } else { + gpu_mem_info[place.device].first += size; + if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) { + gpu_mem_info[place.device].second = gpu_mem_info[place.device].first; + VLOG(3) << "device: " << place.device << " peak memory usage : " + << (gpu_mem_info[place.device].second >> 20) << " MiB"; + } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } } return ptr; #else @@ -188,9 +221,11 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p) { +void Free(const platform::CUDAPlace &place, void *p, + size_t size) { #ifdef PADDLE_WITH_CUDA 
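// gpu_mem_info (declared earlier in this file) keeps a per-device pair of
// (current, peak) allocated bytes: Alloc() adds `size` and logs a new peak via
// VLOG(3), and Free() subtracts it below, which is why the Free() template now
// also receives the allocation size.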
GetGPUBuddyAllocator(place.device)->Free(p); + gpu_mem_info[place.device].first -= size; #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif @@ -243,7 +278,7 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p) { + void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -264,15 +299,17 @@ struct AllocVisitor : public boost::static_visitor { }; struct FreeVisitor : public boost::static_visitor { - inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {} + inline explicit FreeVisitor(void *ptr, size_t size) + : ptr_(ptr), size_(size) {} template inline void operator()(const Place &place) const { - Free(place, ptr_); + Free(place, ptr_, size_); } private: void *ptr_; + size_t size_; }; size_t Usage::operator()(const platform::CPUPlace &cpu) const { @@ -304,8 +341,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { } void LegacyAllocator::Free(Allocation *allocation) { - boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()), - allocation->place()); + boost::apply_visitor( + legacy::FreeVisitor(allocation->ptr(), allocation->size()), + allocation->place()); delete allocation; } } // namespace allocation diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 992a2bdd5ad639bf6176328e94da6eb71a41790c..e099425b94221bf1229e936fc1781615d13dbc26 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(detection) add_subdirectory(elementwise) add_subdirectory(fused) add_subdirectory(metrics) +add_subdirectory(ngraph) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) @@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() @@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) -cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 9c5b8604f40ae56c463b54c71623feb61bd8d297..7ec9d2fed53c9c73952db7dcdfc2d8e634f3f84e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -14,7 +14,7 @@ 
limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include -#include "paddle/fluid/operators/mkldnn_activation_op.h" +#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/platform/port.h" namespace paddle { diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 2bebdb345ab324eb0a2dafd54c74833dd21bdb6d..c054fdb1ba6e5ae5970a51ac9f071f6ef535a4b5 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -83,7 +83,7 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( T* dbias) { const int outer_size = C; const int inner_size = N * HxW; - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage ds_storage; __shared__ typename BlockReduce::TempStorage db_storage; @@ -97,13 +97,16 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( ds_sum += dy[index] * x[index]; db_sum += dy[index]; } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + __syncthreads(); + auto ds_out = + BlockReduce(ds_storage).Reduce(static_cast(ds_sum), cub::Sum()); + auto db_out = + BlockReduce(db_storage).Reduce(static_cast(db_sum), cub::Sum()); + __syncthreads(); if (threadIdx.x == 0) { - dscale[i] = ds_sum; - dbias[i] = db_sum; + dscale[i] = ds_out; + dbias[i] = db_out; } - __syncthreads(); } } diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 30f700f1d91c5a81f39594b6dab7e5e717c9818f..e93cd8615e052e4dfc6255549bf7a9b84b7dd657 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include +#include "paddle/fluid/operators/beam_search_op.h" + #include #include - -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { -void BeamSearch::operator()(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - framework::LoDTensor *selected_ids, - framework::LoDTensor *selected_scores) { - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - - auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); - auto selected_items = ToMap(items, high_level.back()); - VLOG(3) << "selected_items:"; - for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(3) << "offset:" << i; - for (auto &item : selected_items[i]) { - VLOG(3) << ItemToString(item); - } - } - - PruneEndBeams(pre_ids, &selected_items); - // calculate the output tensor's height - size_t num_instances = std::accumulate( - std::begin(selected_items), std::end(selected_items), 0, - [](size_t a, std::vector &b) { return a + b.size(); }); - // the output tensor shape should be [num_instances, 1] - auto dims = framework::make_ddim( - std::vector({static_cast(num_instances), 1})); - selected_ids->Resize(dims); - selected_scores->Resize(dims); - - std::map> hash; - framework::LoD new_lod; - auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); - auto *scores_data = - selected_scores->mutable_data(platform::CPUPlace()); - - // fill in data - std::vector low_level; - size_t low_offset = 0; - for (auto &items : selected_items) { - low_level.push_back(low_offset); - for (auto &item : items) { - ids_data[low_offset] = item.id; - scores_data[low_offset] = item.score; - low_offset++; - } - } - low_level.push_back(low_offset); - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); - } - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); -} - -void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, - std::vector> *items) { - auto *pre_ids_data = pre_ids.data(); - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { - size_t src_prefix_start = high_level[src_idx]; - size_t src_prefix_end = high_level[src_idx + 1]; - bool finish_flag = true; - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { - for (auto &item : items->at(offset)) { - if (item.id != static_cast(end_id_) || - pre_ids_data[offset] != end_id_) { - finish_flag = false; - break; - } - } - if (!finish_flag) break; - } - if (finish_flag) { // all branchs of the beam (source sentence) end and - // prune this beam - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) - items->at(offset).clear(); - } - } -} - -std::vector> BeamSearch::ToMap( - const std::vector> &items, size_t element_num) { - std::vector> result; - result.resize(element_num); - for (auto &entries : items) { - for (const auto &item : entries) { - result[item.offset].push_back(item); - } - } - return result; -} - -std::vector> BeamSearch::SelectTopBeamSizeItems( - const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores) { - std::vector> result; - std::vector items; - // for each source sentence, 
select the top beam_size items across all - // candidate sets. - while (NextItemSet(pre_ids, pre_scores, &items)) { - std::nth_element( - std::begin(items), std::begin(items) + beam_size_, std::end(items), - [](const Item &a, const Item &b) { return a.score > b.score; }); - // prune the top beam_size items. - if (items.size() > beam_size_) { - items.resize(beam_size_); - } - result.emplace_back(items); - } - VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); - for (auto &items : result) { - VLOG(3) << "item set:"; - for (auto &item : items) { - VLOG(3) << ItemToString(item); - } - } - - return result; -} - -// the candidates of a source -bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - std::vector *items) { - if (sent_offset_ >= ids_->NumElements(lod_level_)) { - return false; - } - // find the current candidates - auto ids = *ids_; - auto scores = *scores_; - - auto abs_lod = framework::ToAbsOffset(ids.lod()); - - auto *ids_data = ids.data(); - auto *scores_data = scores.data(); - - size_t instance_dim = 1; - for (int i = 1; i < ids.dims().size(); i++) { - instance_dim *= ids.dims()[i]; - } - - auto *pre_ids_data = pre_ids.data(); - auto *pre_scores_data = pre_scores.data(); - items->clear(); - items->reserve(framework::product(ids.dims())); - for (size_t offset = abs_lod[lod_level_][sent_offset_]; - offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - auto pre_id = pre_ids_data[offset]; - auto pre_score = pre_scores_data[offset]; - if (pre_id == end_id_) { - // Allocate all probability mass to eos_id for finished branchs and the - // other candidate ids can be ignored. - items->emplace_back(offset, end_id_, pre_score); - } else { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); - } - } - } - - sent_offset_++; - return true; -} - -std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { - os << "{"; - os << "offset: " << item.offset << ", "; - os << "id: " << item.id << ", "; - os << "score: " << item.score << ""; - os << "}"; - - return os; -} - -std::string ItemToString(const BeamSearch::Item &item) { - std::ostringstream stream; - stream << item; - return stream.str(); -} - class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -219,29 +29,40 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor) The LoDTensor containing the selected ids at the " "previous step. It should be a tensor with shape (batch_size, 1) " "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " - "thefirst step."); + "the first step."); AddInput("pre_scores", "(LoDTensor) The LoDTensor containing the accumulated " "scores corresponding to the selected ids at the previous step."); AddInput("ids", "(LoDTensor) The LoDTensor containing the candidates ids. Its " - "shape should be (batch_size * beam_size, K), where K supposed to " - "be beam_size."); + "shape should be (batch_size * beam_size, W). If not set, it will " + "be calculated out according to Input(scores) in this operator.") + .AsDispensable(); AddInput("scores", - "(LoDTensor) The LodTensor containing the accumulated scores " - "corresponding to Input(ids) and its shape is the same as the " - "shape of Input(ids)."); + "(LoDTensor) The LoDTensor containing the current scores " + "corresponding to Input(ids). 
If Input(ids) is not nullptr, its " + "shape is the same as that of Input(ids)." + "If is_accumulated is true, Input(scores) is accumulated scores " + "and will be used derectedly. Else, each score will be " + "transformed to the log field and accumulate Input(pre_sores) " + "first."); AddOutput("selected_ids", "A LodTensor that stores the IDs selected by beam search."); AddOutput("selected_scores", "A LoDTensor containing the accumulated scores corresponding to " "Output(selected_ids)."); + AddOutput( + "parent_idx", + "A Tensor preserving the selected_ids' parent indice in pre_ids."); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); AddAttr("beam_size", "beam size for beam search"); AddAttr("end_id", "the token id which indicates the end of a sequence"); + AddAttr("is_accumulated", + "Whether the Input(scores) is accumulated scores.") + .SetDefault(true); AddComment(R"DOC( This operator does the search in beams for one time step. @@ -265,10 +86,9 @@ class BeamSearchOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: void InferShape(framework::InferShapeContext *ctx) const override { for (const std::string &arg : - std::vector({"pre_ids", "ids", "scores"})) { + std::vector({"pre_ids", "scores"})) { PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'", arg); } @@ -279,12 +99,22 @@ class BeamSearchOp : public framework::OperatorWithKernel { } } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = framework::OpKernelType( - ctx.Input("pre_ids")->type(), - platform::CPUPlace()); - return kt; + auto *scores = ctx.Input("scores"); + size_t level = ctx.Attr("level"); + size_t batch_size = scores->lod()[level].size() - 1; + // The current CUDA kernel only support cases with batch_size < 4. + // Compute on CPU for cases with batch_size > 4. + if (batch_size <= 4) { + return framework::OpKernelType( + ctx.Input("pre_ids")->type(), ctx.GetPlace()); + } else { + return framework::OpKernelType( + ctx.Input("pre_ids")->type(), + platform::CPUPlace()); + } } }; diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ef9476eee5d3fac4decd7273da824b2f2349199 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index b5e2ed05924cc8b7bc06058b9b1103ba10be486e..f808020cc765585d1633c6c3bf528080a7e83f07 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,187 +14,12 @@ limitations under the License. */ #pragma once -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/beam_search.h" namespace paddle { namespace operators { -/* - * This is an implementation of beam search. - * - * To explain the details, lets take machine translation task for example, in - * this task, one source sentence is translated to multiple target sentences, - * during this period, one sentence will be translated to multiple translation - * prefixes(target sentence that have not ended), in each time step a prefix - * will have some candidates, input the candidate ids and their corresponding - * scores (probabilities), it will sort and select the top beam_size candidates - * for each source sentence, and store the selected candidates's score and their - * corresponding ids to LoDTensors. - * - * A detailed example: - * - * Input - * - * ids: - * LoD (should have 2 levels) - * first level: [0, 1, 4] - * second level: [0, 1, 2, 3, 4] - * - * tensor's data - * [ - * [4, 2, 5] - * [2, 1, 3] - * [3, 5, 2] - * [8, 2, 1] - * ] - * - * scores: - * LoD same as `ids` - * tensor's data - * [ - * [0.5, 0.3, 0.2] - * [0.6, 0.3, 0.1] - * [0.9, 0.5, 0.1] - * [0.7, 0.5, 0.1] - * ] - * - * the inputs means that there are 2 source sentences to translate, and the - * first source has 1 prefix, the second source has 2 prefix. - * - * lets assume beam size is 2, and the beam search's output should be - * LoD - * first level: - * [0, 1, 2] - * second level: - * [0, 2, 4] - * - * id tensor's data - * [[ - * 4, - * 1, - * 3, - * 8, - * ]] - * - * score tensor's data - * [[ - * 0.5, - * 0.3, - * 0.9, - * 0.7 - * ]] - * - * TODO all the prune operations should be in the beam search, so it is better - * to split the beam search algorithm into a sequence of smaller operators, and - * the prune operators can be inserted in this sequence. - */ -class BeamSearch { - public: - // TODO(superjom) make type customizable - using id_t = size_t; - using score_t = float; - /* - * Input the arguments that needed by this class. - */ - BeamSearch(const framework::LoDTensor& ids, - const framework::LoDTensor& scores, size_t level, size_t beam_size, - int end_id) - : beam_size_(beam_size), - ids_(&ids), - scores_(&scores), - lod_level_(level), - end_id_(end_id) {} - - /* - * The main function of beam search. - * - * @selected_ids: a [None, 1]-shaped tensor with LoD. 
- * In a machine translation model, it might be the candidate term id sets, - * each set stored as a varience-length sequence. - * The format might be described with a two-level LoD - * - [[0 1] - * - [0 1 2]] - * - [[] - * - [0 1]] - * the first level of LoD tells that there are two source sentences. The - * second level describes the details of the candidate id set's offsets in - * the - * source sentences. - * - * @selected_scores: a LoD tensor with the same shape and LoD with - * selected_ids. - * It stores the corresponding scores of candidate ids in selected_ids. - * - * Return false if all the input tensor is empty, in machine translation task - * that means no candidates is provided, and the task will stop running. - */ - void operator()(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores); - /* - * The basic items help to sort. - */ - struct Item { - Item() {} - Item(size_t offset, size_t id, float score) - : offset(offset), id(id), score(score) {} - // offset in the higher lod level. - size_t offset; - // // prefix id in the lower lod level. - // size_t prefix; - // the candidate id - id_t id; - // the corresponding score - score_t score; - }; - - protected: - /* - * Prune the source sentences all branchs finished, and it is optional. - * Pruning must one step later than finishing (thus pre_ids is needed here), - * since the end tokens must be writed out. - */ - void PruneEndBeams(const framework::LoDTensor& pre_ids, - std::vector>* items); - - /* - * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance. - */ - std::vector> ToMap( - const std::vector>& inputs, size_t element_num); - - /* - * For each source, select top beam_size records. - */ - std::vector> SelectTopBeamSizeItems( - const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores); - - /* - * Get the items of next source sequence, return false if no remaining items. 
- */ - bool NextItemSet(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - std::vector* items); - - private: - size_t beam_size_; - const framework::LoDTensor* ids_; - const framework::LoDTensor* scores_; - size_t lod_level_{0}; - size_t sent_offset_{0}; - int end_id_{0}; -}; - -std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); - -std::string ItemToString(const BeamSearch::Item& item); - template class BeamSearchOpKernel : public framework::OpKernel { public: @@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel { auto* scores = context.Input("scores"); auto* pre_ids = context.Input("pre_ids"); auto* pre_scores = context.Input("pre_scores"); - PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); PADDLE_ENFORCE_NOT_NULL(pre_ids); PADDLE_ENFORCE_NOT_NULL(pre_scores); @@ -211,14 +36,22 @@ class BeamSearchOpKernel : public framework::OpKernel { size_t level = context.Attr("level"); size_t beam_size = context.Attr("beam_size"); int end_id = context.Attr("end_id"); - BeamSearch alg(*ids, *scores, level, beam_size, end_id); + bool is_accumulated = context.Attr("is_accumulated"); + auto selected_ids = context.Output("selected_ids"); auto selected_scores = context.Output("selected_scores"); + auto* parent_idx = context.Output("parent_idx"); PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_scores); - alg(*pre_ids, *pre_scores, selected_ids, selected_scores); + PADDLE_ENFORCE_NOT_NULL(parent_idx); + + math::BeamSearchFunctor alg; + alg(context.template device_context(), pre_ids, pre_scores, + ids, scores, selected_ids, selected_scores, parent_idx, level, + beam_size, end_id, is_accumulated); } }; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc deleted file mode 100644 index 40b46781daa989fcd89887a3c01e97e39ea71255..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/fluid/operators/beam_search_op.h" - -#include -#include - -namespace paddle { -namespace test { - -using std::vector; -using framework::LoDTensor; -using framework::LoD; -using operators::BeamSearch; -using paddle::platform::CPUPlace; -using std::cout; -using std::endl; - -void CreateInput(LoDTensor* ids, LoDTensor* scores) { - LoD lod; - vector level0({0, 2, 4}); - vector level1({0, 1, 2, 3, 4}); - lod.push_back(level0); - lod.push_back(level1); - ids->set_lod(lod); - scores->set_lod(lod); - - auto dims = framework::make_ddim(vector({4, 3})); - ids->Resize(dims); - scores->Resize(dims); - CPUPlace place; - - auto* ids_data = ids->mutable_data(place); - auto* scores_data = scores->mutable_data(place); - vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores( - {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); - - for (int i = 0; i < 12; i++) { - ids_data[i] = _ids[i]; - scores_data[i] = _scores[i]; - } -} - -// It seems that beam_search_op has bugs. -TEST(DISABLED_beam_search_op, run) { - CPUPlace place; - LoDTensor ids, scores; - CreateInput(&ids, &scores); - - LoDTensor pre_ids; - pre_ids.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_ids.mutable_data(place)[i] = i + 1; - } - LoDTensor pre_scores; - pre_scores.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); - } - - BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); - LoDTensor sids, sscores; - beamsearch(pre_ids, pre_scores, &sids, &sscores); - - LOG(INFO) << "score: " << sscores << endl; - - ASSERT_EQ(sids.lod(), sscores.lod()); - - vector tids({4, 2, 3, 8}); - vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); - - for (int i = 0; i < 4; i++) { - ASSERT_EQ(tids[i], sids.data()[i]); - ASSERT_EQ(tscores[i], sscores.data()[i]); - } -} - -} // namespace test -} // namespace paddle diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index e223be7af82146e7c69c7c5aab8f08d0fe0d1710..f9570e4e2ed0d9ac8739410eb7cd7397ad09fae4 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel { auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); - const int step_size = x->dims()[0]; - const int num_classes = x->dims()[1]; + const size_t step_size = static_cast(x->dims()[0]); + const size_t num_classes = static_cast(x->dims()[1]); T* dx_data = dx->mutable_data(ctx.GetPlace()); const T* dy_data = dy->data(); const T* x_data = x->data(); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index f97ebecfdd90beade3bef824c04ad7b2763eb036..d8b997cca613f660046106512fc03bf55f9b992d 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -104,9 +104,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); - - Tensor cudnn_workspace; - void* cudnn_workspace_ptr = nullptr; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); @@ -120,24 +118,19 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward 
algo " << algo; } else { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_limit)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - auto search_func = [&]() { int returned_algo_count; std::array fwd_perf_stat; - - CUDNN_ENFORCE(platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(), - cudnn_workspace_ptr, workspace_size_limit)); - + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { const auto& stat = fwd_perf_stat[i]; @@ -188,15 +181,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if (!cudnn_workspace_ptr) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_in_bytes)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } - if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. @@ -204,12 +188,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // cudnnConvolutionForward and cudnnAddTensor // ------------- cudnn conv forward and bias add --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; - - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); - + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); @@ -220,13 +205,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv+bias+act forward -------------------- ScalingParamType alpha1 = 1.0f; ScalingParamType alpha2 = residual ? 
1.0f : 0.0f; - - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, - cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, - output_data)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } std::vector channels = ctx.Attr>("split_channels"); if (channels.size()) { diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 016cf8448c5e07fdedab8c5e4a7d0ae9e2ded1ee..f44094ca6b7b7f23f2e7593ad79e4e2a6f0d3070 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -104,18 +104,16 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } }; @@ -211,22 +209,20 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { output_grad->numel() / output_grad->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. 
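// As in the forward kernel above, the explicit temporary allocation is replaced
// by dev_ctx.cudnn_workspace_handle(): each cuDNN call is wrapped in a lambda
// and executed through workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes),
// so the workspace is borrowed only for the duration of the call.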
for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } @@ -236,12 +232,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + filter_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + filter_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 6c85f1577e0c49d00f4ccf7fa7be0974eb62bdf3..cace42bc1bae93287c330e54d12126efbf9a14bb 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -31,6 +31,7 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) +detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) if(WITH_GPU) detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) @@ -45,3 +46,7 @@ detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op foreach(src ${LOCAL_DETECTION_LIBS}) set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs") endforeach() + +cc_library(mask_util SRCS mask_util.cc DEPS memory) +cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util) +detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 6abeca1da443248d6ad3c1bcc64dd775d77f4ed8..b99edb5bf05f94e762b377a8882e4c3fcdb5afad 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -1,13 +1,17 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #pragma once #include #include "paddle/fluid/framework/eigen.h" @@ -88,7 +92,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes, inter_w = std::max(x_max - x_min + 1, zero); inter_h = std::max(y_max - y_min + 1, zero); inter_area = inter_w * inter_h; - overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + overlaps_et(i, j) = + (inter_area == 0.) ? 0 : inter_area / + (r_box_area + c_box_area - inter_area); } } } diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 06fbb9815c52ea69e3aa9e893512e039853b9514..fdcff62e1fe59b3a2f4925bdff98632f71220abb 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/box_coder_op.h" +#include namespace paddle { namespace operators { @@ -32,32 +33,57 @@ class BoxCoderOp : public framework::OperatorWithKernel { if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, - "The rank of Input of PriorBoxVar must be 2"); + "The rank of Input PriorBox must be 2"); PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); if (ctx->HasInput("PriorBoxVar")) { auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); - PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + PADDLE_ENFORCE( + prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2, + "Input(PriorBoxVar) of BoxCoderOp should be 1 or 2."); + if (prior_box_var_dims.size() == 1) { + PADDLE_ENFORCE_EQ( + prior_box_var_dims[0], 4, + "The 1st dimension of Input(PriorBoxVar) should be 4" + "when the rank is 1."); + } else { + PADDLE_ENFORCE_EQ( + prior_box_dims, prior_box_var_dims, + "The dimension of Input(PriorBoxVar) should be equal to" + "the dimension of Input(PriorBox when the rank is 2.)"); + } } + } - auto code_type = - GetBoxCodeType(ctx->Attrs().Get("code_type")); - if (code_type == BoxCodeType::kEncodeCenterSize) { - PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, - "The rank of Input of TargetBox must be 2"); - PADDLE_ENFORCE_EQ(target_box_dims[1], 4, - "The shape of TargetBox is [M, 4]"); - } else if (code_type == BoxCodeType::kDecodeCenterSize) { - PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, - "The rank of Input of TargetBox must be 3"); + auto code_type = GetBoxCodeType(ctx->Attrs().Get("code_type")); + int axis = ctx->Attrs().Get("axis"); + if (code_type == BoxCodeType::kEncodeCenterSize) { + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input TargetBox must be 2"); + PADDLE_ENFORCE_EQ(target_box_dims[1], 4, + "The shape of TargetBox is [M, 4]"); + ctx->SetOutputDim( + "OutputBox", + framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, + "The rank of Input TargetBox must be 3"); + if (axis == 0) { 
PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); - PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); + } else if (axis == 1) { + PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); + } else { + PADDLE_THROW("axis must be 0 or 1."); } + PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); + ctx->ShareDim("TargetBox", /*->*/ "OutputBox"); + } + + if (code_type == BoxCodeType::kDecodeCenterSize && axis == 1) { + ctx->ShareLoD("PriorBox", /*->*/ "OutputBox"); + } else { + ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); } - ctx->SetOutputDim( - "OutputBox", - framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); - ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); } }; @@ -100,6 +126,21 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default true) " "whether treat the priorbox as a noramlized box") .SetDefault(true); + AddAttr("axis", + "(int, default 0)" + "which axis in PriorBox to broadcast for box decode," + "for example, if axis is 0 and TargetBox has shape" + "[N, M, 4] and PriorBox has shape [M, 4], then PriorBox " + "will broadcast to [N, M, 4] for decoding. It is only valid" + "when code type is decode_center_size") + .SetDefault(0) + .InEnum({0, 1}); + AddAttr>( + "variance", + "(vector, default {})," + "variance of prior box with shape [4]. PriorBoxVar and variance can" + "not be provided at the same time.") + .SetDefault(std::vector{}); AddOutput("OutputBox", "(LoDTensor or Tensor) " "When code_type is 'encode_center_size', the output tensor of " @@ -138,7 +179,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the -encoded/decoded coordinates, width and height. +encoded/decoded coordinates, width and height. + +During Box Decoding, two modes for broadcast are supported. Say target box has +shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior +box will broadcast to target box along the assigned axis. )DOC"); } }; diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index a7af111f63d654319dd1d90d2032956951dfe49e..e078af3eb478a8bebc6a7fc6460d169d803a3c4b 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -9,6 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/box_coder_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -16,11 +19,11 @@ namespace paddle { namespace operators { template -__global__ void EncodeCenterSizeKernel(const T* prior_box_data, - const T* prior_box_var_data, - const T* target_box_data, const int row, - const int col, const int len, - const bool normalized, T* output) { +__global__ void EncodeCenterSizeKernel( + const T* prior_box_data, const T* prior_box_var_data, + const T* target_box_data, const int row, const int col, const int len, + const bool normalized, const T prior_box_var_size, const float* variance, + const int var_size, T* output) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < row * col) { const int row_idx = idx / col; @@ -30,11 +33,9 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data, T prior_box_height = prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1] + (normalized == false); - T prior_box_center_x = - (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; - T prior_box_center_y = (prior_box_data[col_idx * len + 3] + - prior_box_data[col_idx * len + 1]) / - 2; + T prior_box_center_x = prior_box_data[col_idx * len] + prior_box_width / 2; + T prior_box_center_y = + prior_box_data[col_idx * len + 1] + prior_box_height / 2; T target_box_center_x = (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) / @@ -55,58 +56,73 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data, output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); if (prior_box_var_data) { - output[idx * len] /= prior_box_var_data[col_idx * len]; - output[idx * len + 1] /= prior_box_var_data[col_idx * len + 1]; - output[idx * len + 2] /= prior_box_var_data[col_idx * len + 2]; - output[idx * len + 3] /= prior_box_var_data[col_idx * len + 3]; + int prior_var_offset = 0; + if (prior_box_var_size == 2) { + prior_var_offset = col_idx * len; + } + output[idx * len] /= prior_box_var_data[prior_var_offset]; + output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1]; + output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2]; + output[idx * len + 3] /= prior_box_var_data[prior_var_offset + 3]; + } else if (var_size == 4) { + for (int k = 0; k < 4; ++k) { + output[idx * len + k] /= static_cast(variance[k]); + } } } } template -__global__ void DecodeCenterSizeKernel(const T* prior_box_data, - const T* prior_box_var_data, - const T* target_box_data, const int row, - const int col, const int len, - const bool normalized, T* output) { +__global__ void DecodeCenterSizeKernel( + const T* prior_box_data, const T* prior_box_var_data, + const T* target_box_data, const int row, const int col, const int len, + const bool normalized, const T prior_box_var_size, const float* variance, + const int var_size, const int axis, T* output) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; + int prior_box_offset = 0; if (idx < row * col) { const int col_idx = idx % col; - T prior_box_width = prior_box_data[col_idx * len + 2] - - prior_box_data[col_idx * len] + (normalized == false); - T prior_box_height = prior_box_data[col_idx * len + 3] - - prior_box_data[col_idx * len + 1] + + const int row_idx = idx / col; + prior_box_offset = axis == 0 ? 
col_idx * len : row_idx * len; + T prior_box_width = prior_box_data[prior_box_offset + 2] - + prior_box_data[prior_box_offset] + + (normalized == false); + T prior_box_height = prior_box_data[prior_box_offset + 3] - + prior_box_data[prior_box_offset + 1] + (normalized == false); T prior_box_center_x = - (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; - T prior_box_center_y = (prior_box_data[col_idx * len + 3] + - prior_box_data[col_idx * len + 1]) / - 2; + prior_box_data[prior_box_offset] + prior_box_width / 2; + T prior_box_center_y = + prior_box_data[prior_box_offset + 1] + prior_box_height / 2; T target_box_width, target_box_height; T target_box_center_x, target_box_center_y; + T box_var_x = T(1), box_var_y = T(1); + T box_var_w = T(1), box_var_h = T(1); if (prior_box_var_data) { - target_box_width = exp(prior_box_var_data[col_idx * len + 2] * - target_box_data[idx * len + 2]) * - prior_box_width; - target_box_height = exp(prior_box_var_data[col_idx * len + 3] * - target_box_data[idx * len + 3]) * - prior_box_height; - target_box_center_x = prior_box_var_data[col_idx * len] * - target_box_data[idx * len] * prior_box_width + - prior_box_center_x; - target_box_center_y = prior_box_var_data[col_idx * len + 1] * - target_box_data[idx * len + 1] * - prior_box_height + - prior_box_center_y; - } else { - target_box_width = exp(target_box_data[idx * len + 2]) * prior_box_width; - target_box_height = - exp(target_box_data[idx * len + 3]) * prior_box_height; - target_box_center_x = - target_box_data[idx * len] * prior_box_width + prior_box_center_x; - target_box_center_y = target_box_data[idx * len + 1] * prior_box_height + - prior_box_center_y; + int prior_var_offset = 0; + if (prior_box_var_size == 2) { + prior_var_offset = axis == 0 ? 
col_idx * len : row_idx * len; + } + box_var_x = prior_box_var_data[prior_var_offset]; + box_var_y = prior_box_var_data[prior_var_offset + 1]; + box_var_w = prior_box_var_data[prior_var_offset + 2]; + box_var_h = prior_box_var_data[prior_var_offset + 3]; + } else if (var_size == 4) { + box_var_x = static_cast(variance[0]); + box_var_y = static_cast(variance[1]); + box_var_w = static_cast(variance[2]); + box_var_h = static_cast(variance[3]); } + target_box_width = + exp(box_var_w * target_box_data[idx * len + 2]) * prior_box_width; + target_box_height = + exp(box_var_h * target_box_data[idx * len + 3]) * prior_box_height; + target_box_center_x = + box_var_x * target_box_data[idx * len] * prior_box_width + + prior_box_center_x; + target_box_center_y = + box_var_y * target_box_data[idx * len + 1] * prior_box_height + + prior_box_center_y; output[idx * len] = target_box_center_x - target_box_width / 2; output[idx * len + 1] = target_box_center_y - target_box_height / 2; @@ -127,36 +143,64 @@ class BoxCoderCUDAKernel : public framework::OpKernel { auto* prior_box_var = context.Input("PriorBoxVar"); auto* target_box = context.Input("TargetBox"); auto* output_box = context.Output("OutputBox"); - + std::vector variance = context.Attr>("variance"); const T* prior_box_data = prior_box->data(); const T* target_box_data = target_box->data(); const T* prior_box_var_data = nullptr; - if (prior_box_var) prior_box_var_data = prior_box_var->data(); + auto prior_box_var_size = 0; + if (prior_box_var) { + PADDLE_ENFORCE(variance.empty(), + "Input 'PriorBoxVar' and attribute 'variance' should not" + "be used at the same time."); + prior_box_var_data = prior_box_var->data(); + prior_box_var_size = prior_box_var->dims().size(); + } + if (!(variance.empty())) { + PADDLE_ENFORCE(static_cast(variance.size()) == 4, + "Size of attribute 'variance' should be 4"); + } if (target_box->lod().size()) { PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, "Only support 1 level of LoD."); } + const int var_size = static_cast(variance.size()); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); + int axis = context.Attr("axis"); + auto row = target_box->dims()[0]; auto col = prior_box->dims()[0]; + if (code_type == BoxCodeType::kDecodeCenterSize) { + col = target_box->dims()[1]; + } auto len = prior_box->dims()[1]; int block = 512; int grid = (row * col + block - 1) / block; auto& device_ctx = context.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(device_ctx); + int bytes = var_size * sizeof(float); + auto dev_var = allocator.Allocate(bytes); + float* dev_var_data = reinterpret_cast(dev_var->ptr()); + auto cplace = platform::CPUPlace(); + const auto gplace = boost::get(context.GetPlace()); + memory::Copy(gplace, dev_var_data, cplace, &variance[0], bytes, + device_ctx.stream()); + output_box->mutable_data({row, col, len}, context.GetPlace()); T* output = output_box->data(); - auto code_type = GetBoxCodeType(context.Attr("code_type")); - bool normalized = context.Attr("box_normalized"); if (code_type == BoxCodeType::kEncodeCenterSize) { EncodeCenterSizeKernel<<>>( prior_box_data, prior_box_var_data, target_box_data, row, col, len, - normalized, output); + normalized, prior_box_var_size, dev_var_data, var_size, output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { DecodeCenterSizeKernel<<>>( prior_box_data, prior_box_var_data, target_box_data, row, col, len, - normalized, output); + normalized, 
prior_box_var_size, dev_var_data, var_size, axis, output); } } }; diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index b2a2bcdce932032a761a1fc064fe622f7629f9bf..a0b1faf7bdc7001eba2d92b4d03fbaf9feb7bcbb 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -34,7 +35,8 @@ class BoxCoderKernel : public framework::OpKernel { void EncodeCenterSize(const framework::Tensor* target_box, const framework::Tensor* prior_box, const framework::Tensor* prior_box_var, - const bool normalized, T* output) const { + const bool normalized, + const std::vector variance, T* output) const { int64_t row = target_box->dims()[0]; int64_t col = prior_box->dims()[0]; int64_t len = prior_box->dims()[1]; @@ -53,10 +55,9 @@ class BoxCoderKernel : public framework::OpKernel { T prior_box_height = prior_box_data[j * len + 3] - prior_box_data[j * len + 1] + (normalized == false); - T prior_box_center_x = - (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2; T prior_box_center_y = - (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + prior_box_data[j * len + 1] + prior_box_height / 2; T target_box_center_x = (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; @@ -78,10 +79,18 @@ class BoxCoderKernel : public framework::OpKernel { output[offset + 3] = std::log(std::fabs(target_box_height / prior_box_height)); if (prior_box_var) { - output[offset] /= prior_box_var_data[j * len]; - output[offset + 1] /= prior_box_var_data[j * len + 1]; - output[offset + 2] /= prior_box_var_data[j * len + 2]; - output[offset + 3] /= prior_box_var_data[j * len + 3]; + int prior_var_offset = 0; + if (prior_box_var->dims().size() == 2) { + prior_var_offset = j * len; + } + output[offset] /= prior_box_var_data[prior_var_offset]; + output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; + output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; + output[offset + 3] /= prior_box_var_data[prior_var_offset + 3]; + } else if (!(variance.empty())) { + for (int k = 0; k < 4; ++k) { + output[offset + k] /= static_cast(variance[k]); + } } } } @@ -89,58 +98,71 @@ class BoxCoderKernel : public framework::OpKernel { void DecodeCenterSize(const framework::Tensor* target_box, const framework::Tensor* prior_box, const framework::Tensor* prior_box_var, - const bool normalized, T* output) const { + const bool normalized, const int axis, + const std::vector variance, T* output) const { int64_t row = target_box->dims()[0]; - int64_t col = prior_box->dims()[0]; - int64_t len = prior_box->dims()[1]; + int64_t col = target_box->dims()[1]; + int64_t len = target_box->dims()[2]; auto* target_box_data = target_box->data(); auto* prior_box_data = prior_box->data(); const T* prior_box_var_data = nullptr; if (prior_box_var) prior_box_var_data = prior_box_var->data(); - + int prior_box_offset = 0; #ifdef PADDLE_WITH_MKLML #pragma omp parallel for collapse(2) #endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { size_t offset = i * col * len + j * len; - T prior_box_width = prior_box_data[j * len + 2] - - prior_box_data[j * len] + (normalized == false); - T prior_box_height = prior_box_data[j * len + 3] - - prior_box_data[j 
* len + 1] + + if (axis == 0) { + prior_box_offset = j * len; + } else if (axis == 1) { + prior_box_offset = i * len; + } + T prior_box_width = prior_box_data[prior_box_offset + 2] - + prior_box_data[prior_box_offset] + + (normalized == false); + T prior_box_height = prior_box_data[prior_box_offset + 3] - + prior_box_data[prior_box_offset + 1] + (normalized == false); T prior_box_center_x = - (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + prior_box_data[prior_box_offset] + prior_box_width / 2; T prior_box_center_y = - (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + prior_box_data[prior_box_offset + 1] + prior_box_height / 2; T target_box_center_x = 0, target_box_center_y = 0; T target_box_width = 0, target_box_height = 0; + T box_var_x = T(1), box_var_y = T(1); + T box_var_w = T(1), box_var_h = T(1); if (prior_box_var) { - target_box_center_x = prior_box_var_data[j * len] * - target_box_data[offset] * prior_box_width + - prior_box_center_x; - target_box_center_y = prior_box_var_data[j * len + 1] * - target_box_data[offset + 1] * - prior_box_height + - prior_box_center_y; - target_box_width = std::exp(prior_box_var_data[j * len + 2] * - target_box_data[offset + 2]) * - prior_box_width; - target_box_height = std::exp(prior_box_var_data[j * len + 3] * - target_box_data[offset + 3]) * - prior_box_height; - } else { - target_box_center_x = - target_box_data[offset] * prior_box_width + prior_box_center_x; - target_box_center_y = target_box_data[offset + 1] * prior_box_height + - prior_box_center_y; - target_box_width = - std::exp(target_box_data[offset + 2]) * prior_box_width; - target_box_height = - std::exp(target_box_data[offset + 3]) * prior_box_height; + int prior_var_offset = 0; + if (prior_box_var->dims().size() == 2) { + if (axis == 0) + prior_var_offset = j * len; + else if (axis == 1) + prior_var_offset = i * len; + } + box_var_x = prior_box_var_data[prior_var_offset]; + box_var_y = prior_box_var_data[prior_var_offset + 1]; + box_var_w = prior_box_var_data[prior_var_offset + 2]; + box_var_h = prior_box_var_data[prior_var_offset + 3]; + } else if (!(variance.empty())) { + box_var_x = static_cast(variance[0]); + box_var_y = static_cast(variance[1]); + box_var_w = static_cast(variance[2]); + box_var_h = static_cast(variance[3]); } + target_box_center_x = + box_var_x * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = + box_var_y * target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; + target_box_width = + std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width; + target_box_height = std::exp(box_var_h * target_box_data[offset + 3]) * + prior_box_height; output[offset] = target_box_center_x - target_box_width / 2; output[offset + 1] = target_box_center_y - target_box_height / 2; @@ -157,26 +179,40 @@ class BoxCoderKernel : public framework::OpKernel { auto* prior_box_var = context.Input("PriorBoxVar"); auto* target_box = context.Input("TargetBox"); auto* output_box = context.Output("OutputBox"); - + std::vector variance = context.Attr>("variance"); + const int axis = context.Attr("axis"); if (target_box->lod().size()) { PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL, "Only support 1 level of LoD."); } + if (prior_box_var) { + PADDLE_ENFORCE(variance.empty(), + "Input 'PriorBoxVar' and attribute 'variance' should not" + "be used at the same time."); + } + if (!(variance.empty())) { + PADDLE_ENFORCE(static_cast(variance.size()) == 4, + "Size of attribute 'variance' should be 4"); + } + 
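    // Summary of the center-size transform that EncodeCenterSize /
    // DecodeCenterSize above apply (illustrative note; var_* come from the
    // PriorBoxVar input, from the 'variance' attribute, or default to 1 when
    // neither is given):
    //   encode:  tx = (gt_cx - prior_cx) / prior_w / var_x
    //            ty = (gt_cy - prior_cy) / prior_h / var_y
    //            tw = log(|gt_w / prior_w|) / var_w
    //            th = log(|gt_h / prior_h|) / var_h
    //   decode:  cx = var_x * tx * prior_w + prior_cx
    //            cy = var_y * ty * prior_h + prior_cy
    //            w  = exp(var_w * tw) * prior_w
    //            h  = exp(var_h * th) * prior_h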
auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); + auto row = target_box->dims()[0]; auto col = prior_box->dims()[0]; + if (code_type == BoxCodeType::kDecodeCenterSize) { + col = target_box->dims()[1]; + } auto len = prior_box->dims()[1]; output_box->mutable_data({row, col, len}, context.GetPlace()); - auto code_type = GetBoxCodeType(context.Attr("code_type")); - bool normalized = context.Attr("box_normalized"); T* output = output_box->data(); if (code_type == BoxCodeType::kEncodeCenterSize) { EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, - output); + variance, output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { - DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, - output); + DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis, + variance, output); } } }; diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..46727c29de13c1213694540e6614a05f9008d232 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -0,0 +1,437 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/mask_util.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +const int kBoxDim = 4; + +template +void AppendMask(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +class GenerateMaskLabelsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtClasses"), + "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtSegms"), + "Input(GtSegms) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Rois"), "Input(Rois) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("LabelsInt32"), + "Input(LabelsInt32) shouldn't be null."); + + PADDLE_ENFORCE( + ctx->HasOutput("MaskRois"), + "Output(MaskRois) of GenerateMaskLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("RoiHasMaskInt32"), + "Output(RoiHasMaskInt32) of GenerateMaskLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("MaskInt32"), + "Output(MaskInt32) of GenerateMaskLabelsOp should not be null"); + + auto im_info_dims = ctx->GetInputDim("ImInfo"); + auto gt_segms_dims = ctx->GetInputDim("GtSegms"); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + PADDLE_ENFORCE_EQ(gt_segms_dims.size(), 2, + "The rank of Input(GtSegms) must be 2."); + PADDLE_ENFORCE_EQ(gt_segms_dims[1], 2, + "The second dim of Input(GtSegms) must be 2."); + int num_classes = ctx->Attrs().Get("num_classes"); + int resolution = ctx->Attrs().Get("resolution"); + + ctx->SetOutputDim("MaskRois", {-1, 4}); + ctx->SetOutputDim("RoiHasMaskInt32", {-1, 1}); + ctx->SetOutputDim("MaskInt32", {-1, num_classes * resolution * resolution}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Rois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; + +/* + * Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2) + * to encode class specific mask targets. 
+ */ +template +static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, + const Tensor& masks, + const Tensor& mask_class_labels, + const int resolution, const int num_classes, + Tensor* mask_targets) { + const uint8_t* masks_data = masks.data(); + int64_t num_mask = masks.dims()[0]; + const int* mask_class_labels_data = mask_class_labels.data(); + const int M = resolution * resolution; + const int mask_dim = M * num_classes; + + int* mask_targets_data = + mask_targets->mutable_data({num_mask, mask_dim}, ctx.GetPlace()); + math::set_constant(ctx, mask_targets, -1); + for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) { + int cls = mask_class_labels_data[mask_id]; + int start = M * cls; + if (cls > 0) { + for (int i = 0; i < M; ++i) { + mask_targets_data[mask_id * mask_dim + start + i] = + static_cast(masks_data[mask_id * M + i]); + } + } + } +} + +template +std::vector SampleMaskForOneImage( + const platform::CPUDeviceContext& ctx, const Tensor& im_info, + const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_segms, + const Tensor& rois, const Tensor& label_int32, const int num_classes, + const int resolution, const framework::LoD& segm_length) { + // Prepare the mask targets by associating one gt mask to each training roi + // that has a fg (non-bg) class label. + const int64_t gt_size = static_cast(gt_classes.dims()[0]); + const int64_t roi_size = static_cast(rois.dims()[0]); + const int* gt_classes_data = gt_classes.data(); + const int* is_crowd_data = is_crowd.data(); + const int* label_int32_data = label_int32.data(); + PADDLE_ENFORCE_EQ(roi_size, label_int32.dims()[0]); + + std::vector mask_gt_inds, fg_inds; + std::vector>> gt_polys; + + auto polys_num = segm_length[1]; + auto segm_lod_offset = framework::ConvertToOffsetBasedLoD(segm_length); + auto lod1 = segm_lod_offset[1]; + auto lod2 = segm_lod_offset[2]; + const T* polys_data = gt_segms.data(); + for (int64_t i = 0; i < gt_size; ++i) { + if ((gt_classes_data[i] > 0) && (is_crowd_data[i] == 0)) { + mask_gt_inds.emplace_back(i); + + // slice fg segmentation polys + int poly_num = polys_num[i]; + std::vector> polys; + int s_idx = lod1[i]; + for (int j = 0; j < poly_num; ++j) { + int s = lod2[s_idx + j]; + int e = lod2[s_idx + j + 1]; + PADDLE_ENFORCE_NE(s, e); + std::vector plts(polys_data + s * 2, polys_data + e * 2); + polys.push_back(plts); + } + gt_polys.push_back(polys); + } + } + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] > 0) { + fg_inds.emplace_back(i); + } + } + int gt_num = mask_gt_inds.size(); + int fg_num = fg_inds.size(); + + Tensor boxes_from_polys; + boxes_from_polys.mutable_data({gt_num, 4}, platform::CPUPlace()); + Poly2Boxes(gt_polys, boxes_from_polys.data()); + + std::vector roi_has_mask = + std::vector(fg_inds.begin(), fg_inds.end()); + Tensor mask_class_labels; + Tensor masks; + Tensor rois_fg; + + auto im_scale = im_info.data()[2]; + if (fg_num > 0) { + // Class labels for the foreground rois + mask_class_labels.mutable_data({fg_num, 1}, ctx.GetPlace()); + Gather(label_int32_data, 1, fg_inds.data(), fg_inds.size(), + mask_class_labels.data()); + + uint8_t* masks_data = masks.mutable_data( + {fg_num, resolution * resolution}, ctx.GetPlace()); + + // Find overlap between all foreground rois and the bounding boxes + // enclosing each segmentation + T* rois_fg_data = rois_fg.mutable_data({fg_num, 4}, ctx.GetPlace()); + Gather(rois.data(), 4, fg_inds.data(), fg_inds.size(), + rois_fg.data()); + + for (int k = 0; k < rois_fg.numel(); ++k) { + 
rois_fg_data[k] = rois_fg_data[k] / im_scale; + } + + Tensor overlaps_bbfg_bbpolys; + overlaps_bbfg_bbpolys.mutable_data({fg_num, gt_num}, ctx.GetPlace()); + BboxOverlaps(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys); + + // Map from each fg rois to the index of the mask with highest overlap + // (measured by bbox overlap) + T* overlaps_bbfg_bbpolys_data = overlaps_bbfg_bbpolys.data(); + std::vector fg_masks_inds; + for (int64_t i = 0; i < fg_num; ++i) { + const T* v = overlaps_bbfg_bbpolys_data + i * gt_num; + T max_overlap = std::numeric_limits::min(); + int id = 0; + for (int64_t j = 0; j < gt_num; ++j) { + if (v[j] > max_overlap) { + max_overlap = v[j]; + id = j; + } + } + fg_masks_inds.push_back(id); + } + + // add fg targets + for (int64_t i = 0; i < fg_num; ++i) { + int fg_polys_ind = fg_masks_inds[i]; + T* roi_fg = rois_fg_data + i * 4; + uint8_t* mask = masks_data + i * resolution * resolution; + Polys2MaskWrtBox(gt_polys[fg_polys_ind], roi_fg, resolution, mask); + } + } else { + // The network cannot handle empty blobs, so we must provide a mask + // We simply take the first bg roi, given it an all -1's mask (ignore + // label), and label it with class zero (bg). + int bg_num = 1; + T* rois_fg_data = rois_fg.mutable_data({bg_num, 4}, ctx.GetPlace()); + const T* rois_data = rois.data(); + std::vector bg_inds; + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] == 0) { + bg_inds.emplace_back(i); + rois_fg_data[0] = rois_data[0] / im_scale; + rois_fg_data[1] = rois_data[1] / im_scale; + rois_fg_data[2] = rois_data[2] / im_scale; + rois_fg_data[3] = rois_data[3] / im_scale; + break; + } + } + masks.mutable_data({bg_num, resolution * resolution}, + ctx.GetPlace()); + math::set_constant(ctx, &masks, -1); + int* mask_class_labels_data = + mask_class_labels.mutable_data({bg_num, 1}, ctx.GetPlace()); + mask_class_labels_data[0] = 0; + roi_has_mask = std::vector(bg_inds.begin(), bg_inds.end()); + } + + Tensor masks_expand; + ExpandMaskTarget(ctx, masks, mask_class_labels, resolution, num_classes, + &masks_expand); + + T* rois_fg_data = rois_fg.data(); + for (int k = 0; k < rois_fg.numel(); ++k) { + rois_fg_data[k] = rois_fg_data[k] * im_scale; + } + + Tensor roi_has_mask_t; + int roi_has_mask_size = roi_has_mask.size(); + int* roi_has_mask_data = + roi_has_mask_t.mutable_data({roi_has_mask_size, 1}, ctx.GetPlace()); + std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data); + + std::vector res; + res.emplace_back(rois_fg); + res.emplace_back(roi_has_mask_t); + res.emplace_back(masks_expand); + return res; +} + +template +class GenerateMaskLabelsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* im_info = ctx.Input("ImInfo"); + auto* gt_classes = ctx.Input("GtClasses"); + auto* is_crowd = ctx.Input("IsCrowd"); + auto* gt_segms = ctx.Input("GtSegms"); + auto* rois = ctx.Input("Rois"); + auto* label_int32 = ctx.Input("LabelsInt32"); + + auto* mask_rois = ctx.Output("MaskRois"); + auto* roi_has_mask_int32 = ctx.Output("RoiHasMaskInt32"); + auto* mask_int32 = ctx.Output("MaskInt32"); + + int num_classes = ctx.Attr("num_classes"); + int resolution = ctx.Attr("resolution"); + + PADDLE_ENFORCE_EQ(gt_classes->lod().size(), 1UL, + "GenerateMaskLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateMaskLabelsOp is_crowd needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(rois->lod().size(), 1UL, + "GenerateMaskLabelsOp rois needs 1 level of LoD"); 
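    // Expected LoD layout (a reading of the checks in this block): GtClasses,
    // IsCrowd, Rois and LabelsInt32 each carry one LoD level whose offsets
    // split the batch into per-image slices (consumed below via lod().back()),
    // while GtSegms carries three levels nesting images -> gt objects ->
    // polygons, the innermost level indexing the (x, y) points of each
    // polygon in the flat [S, 2] tensor.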
+ PADDLE_ENFORCE_EQ(label_int32->lod().size(), 1UL, + "GenerateMaskLabelsOp label_int32 needs 1 level of LoD"); + + PADDLE_ENFORCE_EQ(gt_segms->lod().size(), 3UL); + + int64_t n = static_cast(gt_classes->lod().back().size() - 1); + PADDLE_ENFORCE_EQ(gt_segms->lod()[0].size() - 1, n); + + int mask_dim = num_classes * resolution * resolution; + + mask_rois->mutable_data({rois->numel(), kBoxDim}, ctx.GetPlace()); + roi_has_mask_int32->mutable_data({rois->numel(), 1}, ctx.GetPlace()); + mask_int32->mutable_data({rois->numel(), mask_dim}, ctx.GetPlace()); + + framework::LoD lod; + std::vector lod0(1, 0); + + int64_t num_mask = 0; + auto& dev_ctx = ctx.device_context(); + + auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); + auto rois_lod = rois->lod().back(); + auto label_int32_lod = label_int32->lod().back(); + auto gt_segms_lod = gt_segms->lod(); + + for (int i = 0; i < n; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor gt_classes_slice = + gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor label_int32_slice = + label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]); + Tensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]); + + auto sub_lod_and_offset = + framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0); + auto lod_length = sub_lod_and_offset.first; + size_t s = sub_lod_and_offset.second.first; + size_t e = sub_lod_and_offset.second.second; + Tensor gt_segms_slice = gt_segms->Slice(s, e); + + std::vector tensor_output = SampleMaskForOneImage( + dev_ctx, im_info_slice, gt_classes_slice, is_crowd_slice, + gt_segms_slice, rois_slice, label_int32_slice, num_classes, + resolution, lod_length); + + Tensor sampled_mask_rois = tensor_output[0]; + Tensor sampled_roi_has_mask_int32 = tensor_output[1]; + Tensor sampled_mask_int32 = tensor_output[2]; + + AppendMask(mask_rois, kBoxDim * num_mask, &sampled_mask_rois); + AppendMask(roi_has_mask_int32, num_mask, + &sampled_roi_has_mask_int32); + AppendMask(mask_int32, mask_dim * num_mask, &sampled_mask_int32); + + num_mask += sampled_mask_rois.dims()[0]; + lod0.emplace_back(num_mask); + } + + lod.emplace_back(lod0); + mask_rois->set_lod(lod); + roi_has_mask_int32->set_lod(lod); + mask_int32->set_lod(lod); + mask_rois->Resize({num_mask, kBoxDim}); + roi_has_mask_int32->Resize({num_mask, 1}); + mask_int32->Resize({num_mask, mask_dim}); + } +}; + +class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("ImInfo", + "(Tensor), This input is a 2D Tensor with shape [B, 3]. " + "B is the number of input images, " + "each element consists of im_height, im_width, im_scale."); + AddInput("GtClasses", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. " + "M is the number of groundtruth, " + "each element is a class label of groundtruth."); + AddInput( + "IsCrowd", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. " + "M is the number of groundtruth, " + "each element is a flag indicates whether a groundtruth is crowd."); + AddInput( + "GtSegms", + "(LoDTensor), This input is a 2D LoDTensor with shape [S, 2], it's LoD " + "level is 3. The LoD[0] represents the gt objects number of each " + "instance. LoD[1] represents the segmentation counts of each objects. " + "LoD[2] represents the polygons number of each segmentation. S the " + "total number of polygons coordinate points. 
Each element is (x, y) " + "coordinate points."); + AddInput( + "Rois", + "(LoDTensor), This input is a 2D LoDTensor with shape [R, 4]. " + "R is the number of rois which is the output of " + "generate_proposal_labels, " + "each element is a bounding box with (xmin, ymin, xmax, ymax) format."); + AddInput("LabelsInt32", + "(LoDTensor), This intput is a 2D LoDTensor with shape [R, 1], " + "each element repersents a class label of a roi"); + AddOutput( + "MaskRois", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. " + "P is the number of mask, " + "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); + AddOutput("RoiHasMaskInt32", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " + "each element repersents the output mask rois index with regard " + "to input rois"); + AddOutput("MaskInt32", + "(LoDTensor), This output is a 4D LoDTensor with shape [P, Q], " + "Q equal to num_classes * resolution * resolution"); + + AddAttr("num_classes", "Class number."); + AddAttr("resolution", "Resolution of mask."); + + AddComment(R"DOC( +This operator can be, for given the RoIs and corresponding labels, +to sample foreground RoIs. This mask branch also has +a :math: `K \\times M^{2}` dimensional output targets for each foreground +RoI, which encodes K binary masks of resolution M x M, one for each of the +K classes. This mask targets are used to compute loss of mask branch. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(generate_mask_labels, ops::GenerateMaskLabelsOp, + ops::GenerateMaskLabelsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(generate_mask_labels, + ops::GenerateMaskLabelsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index a652d4d95750ff89f0ef63338031e80eed6f92bb..5b2e571baf390bfa9b4bdfa6e0f151102de709fc 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -48,20 +48,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Input(GtBoxes) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasOutput("Rois"), - "Output(Rois) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("Rois"), + "Output(Rois) of GenerateProposalLabelsOp should not be null"); PADDLE_ENFORCE( ctx->HasOutput("LabelsInt32"), - "Output(LabelsInt32) of RpnTargetAssignOp should not be null"); + "Output(LabelsInt32) of GenerateProposalLabelsOp should not be null"); PADDLE_ENFORCE( ctx->HasOutput("BboxTargets"), - "Output(BboxTargets) of RpnTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasOutput("BboxInsideWeights"), - "Output(BboxInsideWeights) of RpnTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasOutput("BboxOutsideWeights"), - "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); + "Output(BboxTargets) of GenerateProposalLabelsOp should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("BboxInsideWeights"), + "Output(BboxInsideWeights) of GenerateProposalLabelsOp " + "should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("BboxOutsideWeights"), + "Output(BboxOutsideWeights) of GenerateProposalLabelsOp " + "should not be null"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); auto gt_boxes_dims = 
ctx->GetInputDim("GtBoxes"); @@ -225,30 +226,36 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, template std::vector SampleRoisForOneImage( - const platform::CPUDeviceContext& context, Tensor* rpn_rois, - Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info, - const int batch_size_per_im, const float fg_fraction, const float fg_thresh, - const float bg_thresh_hi, const float bg_thresh_lo, + const platform::CPUDeviceContext& context, const Tensor& rpn_rois_in, + const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_boxes, + const Tensor& im_info, const int batch_size_per_im, const float fg_fraction, + const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo, const std::vector& bbox_reg_weights, const int class_nums, std::minstd_rand engine, bool use_random) { - auto rpn_rois_et = framework::EigenTensor::From(*rpn_rois); - auto im_scale = im_info->data()[2]; - rpn_rois_et = rpn_rois_et / im_scale; + auto im_scale = im_info.data()[2]; + + Tensor rpn_rois; + rpn_rois.mutable_data(rpn_rois_in.dims(), context.GetPlace()); + T* rpn_rois_dt = rpn_rois.data(); + const T* rpn_rois_in_dt = rpn_rois_in.data(); + for (int i = 0; i < rpn_rois.numel(); ++i) { + rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; + } Tensor boxes; - int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; + int proposals_num = gt_boxes.dims()[0] + rpn_rois.dims()[0]; boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - Concat(context, *gt_boxes, *rpn_rois, &boxes); + Concat(context, gt_boxes, rpn_rois, &boxes); // Overlaps Tensor proposal_to_gt_overlaps; - proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes->dims()[0]}, + proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, context.GetPlace()); - BboxOverlaps(boxes, *gt_boxes, &proposal_to_gt_overlaps); + BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); // Generate proposal index std::vector> fg_bg_gt = SampleFgBgGt( - context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im, + context, &proposal_to_gt_overlaps, is_crowd, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); std::vector fg_inds = fg_bg_gt[0]; std::vector bg_inds = fg_bg_gt[1]; @@ -263,7 +270,7 @@ std::vector SampleRoisForOneImage( sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); sampled_labels.mutable_data({boxes_num}, context.GetPlace()); sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - GatherBoxesLabels(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, + GatherBoxesLabels(context, boxes, gt_boxes, gt_classes, fg_inds, bg_inds, gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); // Compute targets @@ -397,8 +404,8 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); Tensor im_info_slice = im_info->Slice(i, i + 1); std::vector tensor_output = SampleRoisForOneImage( - dev_ctx, &rpn_rois_slice, >_classes_slice, &is_crowd_slice, - >_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction, + dev_ctx, rpn_rois_slice, gt_classes_slice, is_crowd_slice, + gt_boxes_slice, im_info_slice, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine, use_random); Tensor sampled_rois = tensor_output[0]; @@ -467,7 +474,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "P usuall equal to batch_size_per_im * batch_size, " "each element is a bounding box with 
[xmin, ymin, xmax, ymax] format."); AddOutput("LabelsInt32", - "(LoDTensor), This output is a 2D LoDTensor with shape [P], " + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " "each element repersents a class label of a roi"); AddOutput("BboxTargets", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd6fee713815345152fce73e85a45aa5cd68b1da --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/mask_util.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +uint32_t UMax(uint32_t a, uint32_t b) { return (a > b) ? a : b; } + +static inline int Compare(const void* a, const void* b) { + uint32_t c = *(reinterpret_cast(a)); + uint32_t d = *(reinterpret_cast(b)); + return c > d ? 1 : c < d ? -1 : 0; +} + +void Decode(const uint32_t* cnts, int m, uint8_t* mask) { + uint8_t v = 0; + for (int j = 0; j < m; j++) { + for (uint32_t k = 0; k < cnts[j]; k++) { + *(mask++) = v; + } + v = !v; + } +} + +typedef uint32_t uint; +void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) { + int j, m = 0; + double scale = 5; + int *x, *y, *u, *v; + uint *a, *b; + platform::CPUPlace cpu; + auto xptr = memory::Alloc(cpu, sizeof(int) * (k + 1) * 2); + x = reinterpret_cast(xptr->ptr()); + y = x + (k + 1); + + for (j = 0; j < k; j++) x[j] = static_cast(scale * xy[j * 2 + 0] + .5); + x[k] = x[0]; + for (j = 0; j < k; j++) y[j] = static_cast(scale * xy[j * 2 + 1] + .5); + y[k] = y[0]; + for (j = 0; j < k; j++) { + m += UMax(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1])) + 1; + } + auto vptr = memory::Alloc(cpu, sizeof(int) * m * 2); + u = reinterpret_cast(vptr->ptr()); + v = u + m; + m = 0; + for (j = 0; j < k; j++) { + int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx, dy, t, d; + int flip; + double s; + dx = abs(xe - xs); + dy = abs(ys - ye); + flip = (dx >= dy && xs > xe) || (dx < dy && ys > ye); + if (flip) { + t = xs; + xs = xe; + xe = t; + t = ys; + ys = ye; + ye = t; + } + if (dx >= dy) { + s = dx == 0 ? 0 : static_cast(ye - ys) / dx; + for (d = 0; d <= dx; d++) { + t = flip ? dx - d : d; + u[m] = t + xs; + v[m] = static_cast(ys + s * t + .5); + m++; + } + } else { + s = dy == 0 ? 0 : static_cast(xe - xs) / dy; + for (d = 0; d <= dy; d++) { + t = flip ? dy - d : d; + v[m] = t + ys; + u[m] = static_cast(xs + s * t + .5); + m++; + } + } + } + /* get points along y-boundary and downsample */ + k = m; + m = 0; + double xd, yd; + auto xyptr = memory::Alloc(cpu, sizeof(int) * k * 2); + x = reinterpret_cast(xyptr->ptr()); + y = x + k; + for (j = 1; j < k; j++) { + if (u[j] != u[j - 1]) { + xd = static_cast(u[j] < u[j - 1] ? 
u[j] : u[j] - 1); + xd = (xd + .5) / scale - .5; + if (floor(xd) != xd || xd < 0 || xd > w - 1) continue; + yd = static_cast(v[j] < v[j - 1] ? v[j] : v[j - 1]); + yd = (yd + .5) / scale - .5; + if (yd < 0) + yd = 0; + else if (yd > h) + yd = h; + yd = ceil(yd); + x[m] = static_cast(xd); + y[m] = static_cast(yd); + m++; + } + } + /* compute rle encoding given y-boundary points */ + k = m; + auto aptr = memory::Alloc(cpu, sizeof(uint) * (k + 1)); + a = reinterpret_cast(aptr->ptr()); + for (j = 0; j < k; j++) a[j] = static_cast(x[j] * h + y[j]); + a[k++] = static_cast(h * w); + + qsort(a, k, sizeof(uint), Compare); + uint p = 0; + for (j = 0; j < k; j++) { + uint t = a[j]; + a[j] -= p; + p = t; + } + auto bptr = memory::Alloc(cpu, sizeof(uint32_t) * k); + b = reinterpret_cast(bptr->ptr()); + j = m = 0; + b[m++] = a[j++]; + while (j < k) { + if (a[j] > 0) { + b[m++] = a[j++]; + } else { + j++; + if (j < k) b[m - 1] += a[j++]; + } + } + + // convert to mask + auto mskptr = memory::Alloc(cpu, sizeof(uint8_t) * h * w); + uint8_t* msk = reinterpret_cast(mskptr->ptr()); + Decode(b, m, msk); + + for (int ii = 0; ii < h; ++ii) { + for (int jj = 0; jj < w; ++jj) { + mask[ii * w + jj] = msk[jj * h + ii]; + } + } +} + +void Poly2Boxes(const std::vector>>& polys, + float* boxes) { + // lists + for (size_t i = 0; i < polys.size(); ++i) { + float x0 = std::numeric_limits::max(); + float x1 = std::numeric_limits::min(); + float y0 = std::numeric_limits::max(); + float y1 = std::numeric_limits::min(); + // each list may have more than one polys + for (size_t j = 0; j < polys[i].size(); ++j) { + for (size_t k = 0; k < polys[i][j].size() / 2; ++k) { + x0 = std::min(x0, polys[i][j][2 * k]); + x1 = std::max(x1, polys[i][j][2 * k]); + y0 = std::min(y0, polys[i][j][2 * k + 1]); + y1 = std::max(y1, polys[i][j][2 * k + 1]); + } + } + boxes[i * 4] = x0; + boxes[i * 4 + 1] = y0; + boxes[i * 4 + 2] = x1; + boxes[i * 4 + 3] = y1; + } +} + +void Polys2MaskWrtBox(const std::vector>& polygons, + const float* box, int M, uint8_t* mask) { + float w = box[2] - box[0]; + float h = box[3] - box[1]; + w = std::max(w, static_cast(1.)); + h = std::max(h, static_cast(1.)); + + uint8_t* msk = nullptr; + if (polygons.size() == 1UL) { + msk = mask; + } else { + msk = reinterpret_cast( + malloc(M * M * polygons.size() * sizeof(uint8_t))); + } + for (size_t i = 0; i < polygons.size(); ++i) { + int k = polygons[i].size() / 2; + std::vector p; + for (int j = 0; j < k; ++j) { + float pw = (polygons[i][2 * j] - box[0]) * M / w; + float ph = (polygons[i][2 * j + 1] - box[1]) * M / h; + p.push_back(pw); + p.push_back(ph); + } + uint8_t* msk_i = msk + i * M * M; + Poly2Mask(p.data(), k, M, M, msk_i); + } + + if (polygons.size() > 1UL) { + for (size_t i = 0; i < polygons.size(); ++i) { + uint8_t* msk_i = msk + i * M * M; + for (int j = 0; j < M * M; ++j) { + if (i == 0) { + mask[j] = msk_i[j]; + } else { + mask[j] = (mask[j] + msk_i[j]) > 0 ? 1 : 0; + } + } + } + free(msk); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h new file mode 100644 index 0000000000000000000000000000000000000000..4e0ea54f6d89ff273382afc1e9a151cfd9773cc6 --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { + +void Poly2Mask(const float* ploy, int k, int h, int w, uint8_t* mask); + +void Poly2Boxes(const std::vector>>& polys, + float* boxes); + +void Polys2MaskWrtBox(const std::vector>& polygons, + const float* box, int M, uint8_t* mask); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..de904e947463977229545897b723b98b4d0708d6 --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util_test.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/mask_util.h" +#include +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +template +void Compare(const T* a, const T* b, const int n) { + for (int i = 0; i < n; i++) { + EXPECT_EQ(a[i], b[i]); + } +} + +TEST(MaskUtil, Poly2MaskTest) { + float polys[] = {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, + 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}; + int h = 8, w = 8; + int k = 5; // length(polys) / 2 + // clang-format off + uint8_t expect_mask[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + // the groud-truth mask is computed by coco API: + // + // import pycocotools.mask as mask_util + // import numpy as np + // segm = [1.97, 1.88, 5.81, 1.88, 1.69, 6.53, 5.94, 6.38, 1.97, 1.88] + // rles = mask_util.frPyObjects([segm], im_h, im_w) + // mask = mask_util.decode(rles) + // print mask + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_mask)); + uint8_t* mask = reinterpret_cast(allocation->ptr()); + Poly2Mask(polys, k, h, w, mask); + Compare(expect_mask, mask, h * w); +} + +TEST(MaskUtil, Poly2BoxesTest) { + // clang-format off + std::vector>> polys = { + {{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}}, + {{2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}} + }; + float expect_boxes[] = { + 1.69f, 1.88f, 5.94f, 6.53f, + 1.69f, 0.88f, 6.94f, 6.63f + }; + // clang-format on + + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); + float* boxes = reinterpret_cast(allocation->ptr()); + Poly2Boxes(polys, boxes); + Compare(expect_boxes, boxes, 8); +} + +TEST(MaskUtil, 
Polys2MaskWrtBoxTest) { + // clang-format off + std::vector>> polys = {{ + {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}, + {2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}}; + float expect_boxes[] = { + 1.69f, 0.88f, 6.94f, 6.63f + }; + uint8_t expect_mask[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + // clang-format on + + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); + float* boxes = reinterpret_cast(allocation->ptr()); + Poly2Boxes(polys, boxes); + Compare(expect_boxes, boxes, 4); + + auto allocat_mask = memory::Alloc(cpu, sizeof(expect_mask)); + uint8_t* mask = reinterpret_cast(allocat_mask->ptr()); + int M = 8; + Polys2MaskWrtBox(polys[0], expect_boxes, M, mask); + Compare(expect_mask, mask, M * M); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 2395b181485429784e0f3dff6d056b84268ef245..f357e3ccf905309e6656f3fa87fbee45dc357c1e 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -9,9 +9,9 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - limitations under the License. */ +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/poly_util.h" @@ -35,30 +35,45 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { auto box_dims = ctx->GetInputDim("BBoxes"); auto score_dims = ctx->GetInputDim("Scores"); + auto score_size = score_dims.size(); if (ctx->IsRuntime()) { + PADDLE_ENFORCE(score_size == 2 || score_size == 3, + "The rank of Input(Scores) must be 2 or 3"); PADDLE_ENFORCE_EQ(box_dims.size(), 3, - "The rank of Input(BBoxes) must be 3."); - PADDLE_ENFORCE_EQ(score_dims.size(), 3, - "The rank of Input(Scores) must be 3."); - PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 || - box_dims[2] == 16 || box_dims[2] == 24 || - box_dims[2] == 32, - "The 2nd dimension of Input(BBoxes) must be 4 or 8, " - "represents the layout of coordinate " - "[xmin, ymin, xmax, ymax] or " - "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " - "8 points: [xi, yi] i= 1,2,...,8 or " - "12 points: [xi, yi] i= 1,2,...,12 or " - "16 points: [xi, yi] i= 1,2,...,16"); - PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2], - "The 1st dimensiong of Input(BBoxes) must be equal to " - "3rd dimension of Input(Scores), which represents the " - "predicted bboxes."); + "The rank of Input(BBoxes) must be 3"); + if (score_size == 3) { + PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 || + box_dims[2] == 16 || box_dims[2] == 24 || + box_dims[2] == 32, + "The last dimension of Input(BBoxes) must be 4 or 8, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax] or " + "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " + "8 points: [xi, yi] i= 1,2,...,8 or " + "12 points: [xi, yi] i= 1,2,...,12 or " + "16 points: [xi, yi] i= 1,2,...,16"); + PADDLE_ENFORCE_EQ( + box_dims[1], score_dims[2], + "The 2nd dimension of Input(BBoxes) must be equal to " + "last dimension of Input(Scores), which represents 
the " + "predicted bboxes."); + } else { + PADDLE_ENFORCE(box_dims[2] == 4, + "The last dimension of Input(BBoxes) must be 4"); + PADDLE_ENFORCE_EQ(box_dims[1], score_dims[1], + "The 2nd dimension of Input(BBoxes)" + "must be equal to the 2nd dimension" + " of Input(Scores)"); + } } // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. - ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); + if (score_size == 3) { + ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); + } else { + ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); + } } protected: @@ -123,8 +138,9 @@ static inline T JaccardOverlap(const T* box1, const T* box2, const T inter_ymin = std::max(box1[1], box2[1]); const T inter_xmax = std::min(box1[2], box2[2]); const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; + T norm = normalized ? static_cast(0.) : static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; const T inter_area = inter_w * inter_h; const T bbox1_area = BBoxArea(box1, normalized); const T bbox2_area = BBoxArea(box2, normalized); @@ -139,7 +155,7 @@ T PolyIoU(const T* box1, const T* box2, const size_t box_size, T bbox2_area = PolyArea(box2, box_size, normalized); T inter_area = PolyOverlapArea(box1, box2, box_size, normalized); if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { - // If coordinate values are is invalid + // If coordinate values are invalid // if area size <= 0, return 0. return T(0.); } else { @@ -147,12 +163,35 @@ T PolyIoU(const T* box1, const T* box2, const size_t box_size, } } +template +void SliceOneClass(const platform::DeviceContext& ctx, + const framework::Tensor& items, const int class_id, + framework::Tensor* one_class_item) { + T* item_data = one_class_item->mutable_data(ctx.GetPlace()); + const T* items_data = items.data(); + const int64_t num_item = items.dims()[0]; + const int class_num = items.dims()[1]; + if (items.dims().size() == 3) { + int item_size = items.dims()[2]; + for (int i = 0; i < num_item; ++i) { + std::memcpy(item_data + i * item_size, + items_data + i * class_num * item_size + class_id * item_size, + sizeof(T) * item_size); + } + } else { + for (int i = 0; i < num_item; ++i) { + item_data[i] = items_data[i * class_num + class_id]; + } + } +} + template class MultiClassNMSKernel : public framework::OpKernel { public: void NMSFast(const Tensor& bbox, const Tensor& scores, const T score_threshold, const T nms_threshold, const T eta, - const int64_t top_k, std::vector* selected_indices) const { + const int64_t top_k, std::vector* selected_indices, + const bool normalized) const { // The total boxes for each instance. 
int64_t num_boxes = bbox.dims()[0]; // 4: [xmin ymin xmax ymax] @@ -178,15 +217,16 @@ class MultiClassNMSKernel : public framework::OpKernel { T overlap = T(0.); // 4: [xmin ymin xmax ymax] if (box_size == 4) { - overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, true); + overlap = + JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, normalized); } // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 if (box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32) { - overlap = - PolyIoU(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, box_size, true); + overlap = PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, box_size, + normalized); } keep = overlap <= adaptive_threshold; } else { @@ -205,37 +245,58 @@ class MultiClassNMSKernel : public framework::OpKernel { void MultiClassNMS(const framework::ExecutionContext& ctx, const Tensor& scores, const Tensor& bboxes, + const int scores_size, std::map>* indices, int* num_nmsed_out) const { int64_t background_label = ctx.Attr("background_label"); int64_t nms_top_k = ctx.Attr("nms_top_k"); int64_t keep_top_k = ctx.Attr("keep_top_k"); + bool normalized = ctx.Attr("normalized"); T nms_threshold = static_cast(ctx.Attr("nms_threshold")); T nms_eta = static_cast(ctx.Attr("nms_eta")); T score_threshold = static_cast(ctx.Attr("score_threshold")); + auto& dev_ctx = ctx.template device_context(); - int64_t class_num = scores.dims()[0]; - int64_t predict_dim = scores.dims()[1]; int num_det = 0; + + int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; + Tensor bbox_slice, score_slice; for (int64_t c = 0; c < class_num; ++c) { if (c == background_label) continue; - Tensor score = scores.Slice(c, c + 1); - NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k, - &((*indices)[c])); + if (scores_size == 3) { + score_slice = scores.Slice(c, c + 1); + bbox_slice = bboxes; + } else { + score_slice.Resize({scores.dims()[0], 1}); + bbox_slice.Resize({scores.dims()[0], 4}); + SliceOneClass(dev_ctx, scores, c, &score_slice); + SliceOneClass(dev_ctx, bboxes, c, &bbox_slice); + } + NMSFast(bbox_slice, score_slice, score_threshold, nms_threshold, nms_eta, + nms_top_k, &((*indices)[c]), normalized); + if (scores_size == 2) { + std::stable_sort((*indices)[c].begin(), (*indices)[c].end()); + } num_det += (*indices)[c].size(); } *num_nmsed_out = num_det; const T* scores_data = scores.data(); if (keep_top_k > -1 && num_det > keep_top_k) { + const T* sdata; std::vector>> score_index_pairs; for (const auto& it : *indices) { int label = it.first; - const T* sdata = scores_data + label * predict_dim; + if (scores_size == 3) { + sdata = scores_data + label * scores.dims()[1]; + } else { + score_slice.Resize({scores.dims()[0], 1}); + SliceOneClass(dev_ctx, scores, label, &score_slice); + sdata = score_slice.data(); + } const std::vector& label_indices = it.second; for (size_t j = 0; j < label_indices.size(); ++j) { int idx = label_indices[j]; - PADDLE_ENFORCE_LT(idx, predict_dim); score_index_pairs.push_back( std::make_pair(sdata[idx], std::make_pair(label, idx))); } @@ -252,31 +313,55 @@ class MultiClassNMSKernel : public framework::OpKernel { int idx = score_index_pairs[j].second.second; new_indices[label].push_back(idx); } + if (scores_size == 2) { + for (const auto& it : new_indices) { + int label = it.first; + std::stable_sort(new_indices[label].begin(), + new_indices[label].end()); + } + } new_indices.swap(*indices); *num_nmsed_out = 
keep_top_k; } } - void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, + void MultiClassOutput(const platform::DeviceContext& ctx, + const Tensor& scores, const Tensor& bboxes, const std::map>& selected_indices, - Tensor* outs) const { + const int scores_size, Tensor* outs) const { + int64_t class_num = scores.dims()[1]; int64_t predict_dim = scores.dims()[1]; int64_t box_size = bboxes.dims()[1]; - int64_t out_dim = bboxes.dims()[1] + 2; + if (scores_size == 2) { + box_size = bboxes.dims()[2]; + } + int64_t out_dim = box_size + 2; auto* scores_data = scores.data(); auto* bboxes_data = bboxes.data(); auto* odata = outs->data(); - + const T* sdata; + Tensor bbox; + bbox.Resize({scores.dims()[0], box_size}); int count = 0; for (const auto& it : selected_indices) { int label = it.first; - const T* sdata = scores_data + label * predict_dim; const std::vector& indices = it.second; + if (scores_size == 2) { + SliceOneClass(ctx, bboxes, label, &bbox); + } else { + sdata = scores_data + label * predict_dim; + } for (size_t j = 0; j < indices.size(); ++j) { int idx = indices[j]; - const T* bdata = bboxes_data + idx * box_size; - odata[count * out_dim] = label; // label - odata[count * out_dim + 1] = sdata[idx]; // score + odata[count * out_dim] = label; // label + const T* bdata; + if (scores_size == 3) { + bdata = bboxes_data + idx * box_size; + odata[count * out_dim + 1] = sdata[idx]; // score + } else { + bdata = bbox.data() + idx * box_size; + odata[count * out_dim + 1] = *(scores_data + idx * class_num + label); + } // xmin, ymin, xmax, ymax or multi-points coordinates std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); count++; @@ -285,52 +370,64 @@ class MultiClassNMSKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& ctx) const override { - auto* boxes = ctx.Input("BBoxes"); - auto* scores = ctx.Input("Scores"); + auto* boxes = ctx.Input("BBoxes"); + auto* scores = ctx.Input("Scores"); auto* outs = ctx.Output("Out"); auto score_dims = scores->dims(); - - int64_t batch_size = score_dims[0]; - int64_t class_num = score_dims[1]; - int64_t predict_dim = score_dims[2]; - int64_t box_dim = boxes->dims()[2]; - int64_t out_dim = boxes->dims()[2] + 2; + auto score_size = score_dims.size(); + auto& dev_ctx = ctx.template device_context(); std::vector>> all_indices; std::vector batch_starts = {0}; - for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - Tensor ins_boxes = boxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - + int64_t batch_size = score_dims[0]; + int64_t box_dim = boxes->dims()[2]; + int64_t out_dim = box_dim + 2; + int num_nmsed_out = 0; + Tensor boxes_slice, scores_slice; + int n = score_size == 3 ? 
batch_size : boxes->lod().back().size() - 1; + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores_slice = scores->Slice(i, i + 1); + scores_slice.Resize({score_dims[1], score_dims[2]}); + boxes_slice = boxes->Slice(i, i + 1); + boxes_slice.Resize({score_dims[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); + boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); + } std::map> indices; - int num_nmsed_out = 0; - MultiClassNMS(ctx, ins_score, ins_boxes, &indices, &num_nmsed_out); + MultiClassNMS(ctx, scores_slice, boxes_slice, score_size, &indices, + &num_nmsed_out); all_indices.push_back(indices); batch_starts.push_back(batch_starts.back() + num_nmsed_out); } int num_kept = batch_starts.back(); if (num_kept == 0) { - T* od = outs->mutable_data({1}, ctx.GetPlace()); + T* od = outs->mutable_data({1, 1}, ctx.GetPlace()); od[0] = -1; + batch_starts = {0, 1}; } else { outs->mutable_data({num_kept, out_dim}, ctx.GetPlace()); - for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - Tensor ins_boxes = boxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores_slice = scores->Slice(i, i + 1); + boxes_slice = boxes->Slice(i, i + 1); + scores_slice.Resize({score_dims[1], score_dims[2]}); + boxes_slice.Resize({score_dims[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); + boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); + } int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { Tensor out = outs->Slice(s, e); - MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); + MultiClassOutput(dev_ctx, scores_slice, boxes_slice, all_indices[i], + score_dims.size(), &out); } } } @@ -346,17 +443,24 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("BBoxes", - "(Tensor) A 3-D Tensor with shape " + "Two types of bboxes are supported:" + "1. (Tensor) A 3-D Tensor with shape " "[N, M, 4 or 8 16 24 32] represents the " "predicted locations of M bounding bboxes, N is the batch size. " "Each bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax], when box size equals to 4."); + "[xmin, ymin, xmax, ymax], when box size equals to 4." + "2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]" + "M is the number of bounding boxes, C is the class number"); AddInput("Scores", - "(Tensor) A 3-D Tensor with shape [N, C, M] represents the " + "Two types of scores are supported:" + "1. (Tensor) A 3-D Tensor with shape [N, C, M] represents the " "predicted confidence predictions. N is the batch size, C is the " "class number, M is number of bounding boxes. For each category " "there are total M scores which corresponding M bounding boxes. " - " Please note, M is equal to the 1st dimension of BBoxes. "); + " Please note, M is equal to the 2nd dimension of BBoxes. " + "2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. " + "M is the number of bbox, C is the class number. In this case, " + "Input BBoxes should be the second case with shape [M, C, 4]."); AddAttr( "background_label", "(int, defalut: 0) " @@ -384,6 +488,10 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { "(int64_t) " "Number of total bboxes to be kept per image after NMS " "step. 
-1 means keeping all bboxes after NMS step."); + AddAttr("normalized", + "(bool, default true) " + "Whether detections are normalized.") + .SetDefault(true); AddOutput("Out", "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " "detections. Each row has 6 values: " @@ -399,24 +507,21 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator is to do multi-class non maximum suppression (NMS) on a batched of boxes and scores. - In the NMS step, this operator greedily selects a subset of detection bounding boxes that have high scores larger than score_threshold, if providing this threshold, then selects the largest nms_top_k confidences scores if nms_top_k is larger than -1. Then this operator pruns away boxes that have high IOU (intersection over union) overlap with already selected boxes by adaptive threshold NMS based on parameters of nms_threshold and nms_eta. - Aftern NMS step, at most keep_top_k number of total bboxes are to be kept per image if keep_top_k is larger than -1. - This operator support multi-class and batched inputs. It applying NMS independently for each class. The outputs is a 2-D LoDTenosr, for each image, the offsets in first dimension of LoDTensor are called LoD, the number of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, means there is no detected bbox for this image. If there is no detected boxes -for all images, all the elements in LoD are 0, and the Out only contains one -value which is -1. +for all images, all the elements in LoD are set to {1}, and the Out only +contains one value which is -1. )DOC"); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc similarity index 69% rename from paddle/fluid/operators/yolov3_loss_op.cc rename to paddle/fluid/operators/detection/yolov3_loss_op.cc index 60508f7ab871910c38f1e4aa04c2035075d37df5..2a69ad4b53c26f5e2e0547e75e0d9c6518a8bcba 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -9,7 +9,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/yolov3_loss_op.h" +#include "paddle/fluid/operators/detection/yolov3_loss_op.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -29,23 +29,33 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Input(GTLabel) of Yolov3LossOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ObjectnessMask"), + "Output(ObjectnessMask) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("GTMatchMask"), + "Output(GTMatchMask) of Yolov3LossOp should not be null."); auto dim_x = ctx->GetInputDim("X"); auto dim_gtbox = ctx->GetInputDim("GTBox"); auto dim_gtlabel = ctx->GetInputDim("GTLabel"); auto anchors = ctx->Attrs().Get>("anchors"); + int anchor_num = anchors.size() / 2; + auto anchor_mask = ctx->Attrs().Get>("anchor_mask"); + int mask_num = anchor_mask.size(); auto class_num = ctx->Attrs().Get("class_num"); + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], "Input(X) dim[3] and dim[4] should be euqal."); - PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), - "Input(X) dim[1] should be equal to (anchor_number * (5 " - "+ class_num))."); + PADDLE_ENFORCE_EQ( + dim_x[1], mask_num * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))."); PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3, "Input(GTBox) should be a 3-D tensor"); PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5"); PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2, - "Input(GTBox) should be a 2-D tensor"); + "Input(GTLabel) should be a 2-D tensor"); PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0], "Input(GTBox) and Input(GTLabel) dim[0] should be same"); PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1], @@ -54,11 +64,22 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Attr(anchors) length should be greater then 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, "Attr(anchors) length should be even integer."); + for (size_t i = 0; i < anchor_mask.size(); i++) { + PADDLE_ENFORCE_LT( + anchor_mask[i], anchor_num, + "Attr(anchor_mask) should not crossover Attr(anchors)."); + } PADDLE_ENFORCE_GT(class_num, 0, "Attr(class_num) should be an integer greater then 0."); - std::vector dim_out({1}); + std::vector dim_out({dim_x[0]}); ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); + + std::vector dim_obj_mask({dim_x[0], mask_num, dim_x[2], dim_x[3]}); + ctx->SetOutputDim("ObjectnessMask", framework::make_ddim(dim_obj_mask)); + + std::vector dim_gt_match_mask({dim_gtbox[0], dim_gtbox[1]}); + ctx->SetOutputDim("GTMatchMask", framework::make_ddim(dim_gt_match_mask)); } protected: @@ -73,11 +94,11 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor of YOLO v3 loss operator, " + "The input tensor of YOLOv3 loss operator, " "This is a 4-D tensor with shape of [N, C, H, W]." 
"H and W should be same, and the second dimention(C) stores" "box locations, confidence score and classification one-hot" - "key of each anchor box"); + "keys of each anchor box"); AddInput("GTBox", "The input tensor of ground truth boxes, " "This is a 3-D tensor with shape of [N, max_box_num, 5], " @@ -89,32 +110,39 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("GTLabel", "The input tensor of ground truth label, " "This is a 2-D tensor with shape of [N, max_box_num], " - "and each element shoudl be an integer to indicate the " + "and each element should be an integer to indicate the " "box class id."); AddOutput("Loss", "The output yolov3 loss tensor, " - "This is a 1-D tensor with shape of [1]"); + "This is a 1-D tensor with shape of [N]"); + AddOutput("ObjectnessMask", + "This is an intermediate tensor with shape of [N, M, H, W], " + "M is the number of anchor masks. This parameter caches the " + "mask for calculate objectness loss in gradient kernel.") + .AsIntermediate(); + AddOutput("GTMatchMask", + "This is an intermediate tensor with shape of [N, B], " + "B is the max box number of GT boxes. This parameter caches " + "matched mask index of each GT boxes for gradient calculate.") + .AsIntermediate(); AddAttr("class_num", "The number of classes to predict."); AddAttr>("anchors", "The anchor width and height, " - "it will be parsed pair by pair."); + "it will be parsed pair by pair.") + .SetDefault(std::vector{}); + AddAttr>("anchor_mask", + "The mask index of anchors used in " + "current YOLOv3 loss calculation.") + .SetDefault(std::vector{}); + AddAttr("downsample_ratio", + "The downsample ratio from network input to YOLOv3 loss " + "input, so 32, 16, 8 should be set for the first, second, " + "and thrid YOLOv3 loss operators.") + .SetDefault(32); AddAttr("ignore_thresh", - "The ignore threshold to ignore confidence loss."); - AddAttr("loss_weight_xy", "The weight of x, y location loss.") - .SetDefault(1.0); - AddAttr("loss_weight_wh", "The weight of w, h location loss.") - .SetDefault(1.0); - AddAttr( - "loss_weight_conf_target", - "The weight of confidence score loss in locations with target object.") - .SetDefault(1.0); - AddAttr("loss_weight_conf_notarget", - "The weight of confidence score loss in locations without " - "target object.") - .SetDefault(1.0); - AddAttr("loss_weight_class", "The weight of classification loss.") - .SetDefault(1.0); + "The ignore threshold to ignore confidence loss.") + .SetDefault(0.7); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground truth boxes. @@ -147,17 +175,28 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { thresh, the confidence score loss of this anchor box will be ignored. Therefore, the yolov3 loss consist of three major parts, box location loss, - confidence score loss, and classification loss. The MSE loss is used for - box location, and binary cross entropy loss is used for confidence score - loss and classification loss. + confidence score loss, and classification loss. The L2 loss is used for + box coordinates (w, h), and sigmoid cross entropy loss is used for box + coordinates (x, y), confidence score loss and classification loss. + + Each groud truth box find a best matching anchor box in all anchors, + prediction of this anchor box will incur all three parts of losses, and + prediction of anchor boxes with no GT box matched will only incur objectness + loss. 
+ + In order to trade off box coordinate losses between big boxes and small + boxes, box coordinate losses will be mutiplied by scale weight, which is + calculated as follow. + + $$ + weight_{box} = 2.0 - t_w * t_h + $$ Final loss will be represented as follow. $$ - loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh} - + \loss_weight_{conf_target} * loss_{conf_target} - + \loss_weight_{conf_notarget} * loss_{conf_notarget} - + \loss_weight_{class} * loss_{class} + loss = (loss_{xy} + loss_{wh}) * weight_{box} + + loss_{conf} + loss_{class} $$ )DOC"); } @@ -196,6 +235,8 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("GTBox", Input("GTBox")); op->SetInput("GTLabel", Input("GTLabel")); op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetInput("ObjectnessMask", Output("ObjectnessMask")); + op->SetInput("GTMatchMask", Output("GTMatchMask")); op->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8407d4e6e8f87a2e8d073c4fbda5691abe1bba68 --- /dev/null +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -0,0 +1,447 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; +template +using EigenVector = framework::EigenVector; + +template +static inline bool LessEqualZero(T x) { + return x < 1e-6; +} + +template +static T SigmoidCrossEntropy(T x, T label) { + return (x > 0 ? 
x : 0.0) - x * label + std::log(1.0 + std::exp(-std::abs(x))); +} + +template +static T L2Loss(T x, T y) { + return 0.5 * (y - x) * (y - x); +} + +template +static T SigmoidCrossEntropyGrad(T x, T label) { + return 1.0 / (1.0 + std::exp(-x)) - label; +} + +template +static T L2LossGrad(T x, T y) { + return x - y; +} + +static int GetMaskIndex(std::vector mask, int val) { + for (size_t i = 0; i < mask.size(); i++) { + if (mask[i] == val) { + return i; + } + } + return -1; +} + +template +struct Box { + T x, y, w, h; +}; + +template +static inline T sigmoid(T x) { + return 1.0 / (1.0 + std::exp(-x)); +} + +template +static inline Box GetYoloBox(const T* x, std::vector anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride) { + Box b; + b.x = (i + sigmoid(x[index])) / grid_size; + b.y = (j + sigmoid(x[index + stride])) / grid_size; + b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] / input_size; + b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] / input_size; + return b; +} + +template +static inline Box GetGtBox(const T* gt, int batch, int max_boxes, int idx) { + Box b; + b.x = gt[(batch * max_boxes + idx) * 4]; + b.y = gt[(batch * max_boxes + idx) * 4 + 1]; + b.w = gt[(batch * max_boxes + idx) * 4 + 2]; + b.h = gt[(batch * max_boxes + idx) * 4 + 3]; + return b; +} + +template +static inline T BoxOverlap(T c1, T w1, T c2, T w2) { + T l1 = c1 - w1 / 2.0; + T l2 = c2 - w2 / 2.0; + T left = l1 > l2 ? l1 : l2; + T r1 = c1 + w1 / 2.0; + T r2 = c2 + w2 / 2.0; + T right = r1 < r2 ? r1 : r2; + return right - left; +} + +template +static inline T CalcBoxIoU(Box b1, Box b2) { + T w = BoxOverlap(b1.x, b1.w, b2.x, b2.w); + T h = BoxOverlap(b1.y, b1.h, b2.y, b2.h); + T inter_area = (w < 0 || h < 0) ? 0.0 : w * h; + T union_area = b1.w * b1.h + b2.w * b2.h - inter_area; + return inter_area / union_area; +} + +static inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride, int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +static void CalcBoxLocationLoss(T* loss, const T* input, Box gt, + std::vector anchors, int an_idx, + int box_idx, int gi, int gj, int grid_size, + int input_size, int stride) { + T tx = gt.x * grid_size - gi; + T ty = gt.y * grid_size - gj; + T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); + T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); + + T scale = (2.0 - gt.w * gt.h); + loss[0] += SigmoidCrossEntropy(input[box_idx], tx) * scale; + loss[0] += SigmoidCrossEntropy(input[box_idx + stride], ty) * scale; + loss[0] += L2Loss(input[box_idx + 2 * stride], tw) * scale; + loss[0] += L2Loss(input[box_idx + 3 * stride], th) * scale; +} + +template +static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input, + Box gt, std::vector anchors, + int an_idx, int box_idx, int gi, int gj, + int grid_size, int input_size, int stride) { + T tx = gt.x * grid_size - gi; + T ty = gt.y * grid_size - gj; + T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); + T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); + + T scale = (2.0 - gt.w * gt.h); + input_grad[box_idx] = + SigmoidCrossEntropyGrad(input[box_idx], tx) * scale * loss; + input_grad[box_idx + stride] = + SigmoidCrossEntropyGrad(input[box_idx + stride], ty) * scale * loss; + input_grad[box_idx + 2 * stride] = + L2LossGrad(input[box_idx + 2 * stride], tw) * scale * loss; + input_grad[box_idx + 3 * stride] = + L2LossGrad(input[box_idx + 3 
* stride], th) * scale * loss; +} + +template +static inline void CalcLabelLoss(T* loss, const T* input, const int index, + const int label, const int class_num, + const int stride) { + for (int i = 0; i < class_num; i++) { + T pred = input[index + i * stride]; + loss[0] += SigmoidCrossEntropy(pred, (i == label) ? 1.0 : 0.0); + } +} + +template +static inline void CalcLabelLossGrad(T* input_grad, const T loss, + const T* input, const int index, + const int label, const int class_num, + const int stride) { + for (int i = 0; i < class_num; i++) { + T pred = input[index + i * stride]; + input_grad[index + i * stride] = + SigmoidCrossEntropyGrad(pred, (i == label) ? 1.0 : 0.0) * loss; + } +} + +template +static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness, + const int n, const int an_num, const int h, + const int w, const int stride, + const int an_stride) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + T obj = objness[k * w + l]; + if (obj > 1e-5) { + // positive sample: obj = 1 + loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0); + } else if (obj > -0.5) { + // negetive sample: obj = 0 + loss[i] += SigmoidCrossEntropy(input[k * w + l], 0.0); + } + } + } + objness += stride; + input += an_stride; + } + } +} + +template +static inline void CalcObjnessLossGrad(T* input_grad, const T* loss, + const T* input, const T* objness, + const int n, const int an_num, + const int h, const int w, + const int stride, const int an_stride) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + T obj = objness[k * w + l]; + if (obj > 1e-5) { + input_grad[k * w + l] = + SigmoidCrossEntropyGrad(input[k * w + l], 1.0) * loss[i]; + } else if (obj > -0.5) { + input_grad[k * w + l] = + SigmoidCrossEntropyGrad(input[k * w + l], 0.0) * loss[i]; + } + } + } + objness += stride; + input += an_stride; + input_grad += an_stride; + } + } +} + +template +static void inline GtValid(bool* valid, const T* gtbox, const int n, + const int b) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < b; j++) { + if (LessEqualZero(gtbox[j * 4 + 2]) || LessEqualZero(gtbox[j * 4 + 3])) { + valid[j] = false; + } else { + valid[j] = true; + } + } + valid += b; + gtbox += b * 4; + } +} + +template +class Yolov3LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto* loss = ctx.Output("Loss"); + auto* objness_mask = ctx.Output("ObjectnessMask"); + auto* gt_match_mask = ctx.Output("GTMatchMask"); + auto anchors = ctx.Attr>("anchors"); + auto anchor_mask = ctx.Attr>("anchor_mask"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + const int mask_num = anchor_mask.size(); + const int b = gt_box->dims()[1]; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + const T* input_data = input->data(); + const T* gt_box_data = gt_box->data(); + const int* gt_label_data = gt_label->data(); + T* loss_data = loss->mutable_data({n}, ctx.GetPlace()); + 
memset(loss_data, 0, loss->numel() * sizeof(T)); + T* obj_mask_data = + objness_mask->mutable_data({n, mask_num, h, w}, ctx.GetPlace()); + memset(obj_mask_data, 0, objness_mask->numel() * sizeof(T)); + int* gt_match_mask_data = + gt_match_mask->mutable_data({n, b}, ctx.GetPlace()); + + // calc valid gt box mask, avoid calc duplicately in following code + Tensor gt_valid_mask; + bool* gt_valid_mask_data = + gt_valid_mask.mutable_data({n, b}, ctx.GetPlace()); + GtValid(gt_valid_mask_data, gt_box_data, n, b); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < mask_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + // each predict box find a best match gt box, if overlap is bigger + // then ignore_thresh, ignore the objectness loss. + int box_idx = + GetEntryIndex(i, j, k * w + l, mask_num, an_stride, stride, 0); + Box pred = GetYoloBox(input_data, anchors, l, k, anchor_mask[j], + h, input_size, box_idx, stride); + T best_iou = 0; + for (int t = 0; t < b; t++) { + if (!gt_valid_mask_data[i * b + t]) { + continue; + } + Box gt = GetGtBox(gt_box_data, i, b, t); + T iou = CalcBoxIoU(pred, gt); + if (iou > best_iou) { + best_iou = iou; + } + } + + // If best IoU is bigger then ignore_thresh, + // ignore the objectness loss. + if (best_iou > ignore_thresh) { + int obj_idx = (i * mask_num + j) * stride + k * w + l; + obj_mask_data[obj_idx] = static_cast(-1); + } + // all losses should be calculated if best IoU + // is bigger then truth thresh, but currently, + // truth thresh is an unreachable value as 1.0. + } + } + } + for (int t = 0; t < b; t++) { + if (!gt_valid_mask_data[i * b + t]) { + gt_match_mask_data[i * b + t] = -1; + continue; + } + Box gt = GetGtBox(gt_box_data, i, b, t); + int gi = static_cast(gt.x * w); + int gj = static_cast(gt.y * h); + Box gt_shift = gt; + gt_shift.x = 0.0; + gt_shift.y = 0.0; + T best_iou = 0.0; + int best_n = 0; + // each gt box find a best match anchor box as positive sample, + // for positive sample, all losses should be calculated, and for + // other samples, only objectness loss is required. 
+ for (int an_idx = 0; an_idx < an_num; an_idx++) { + Box an_box; + an_box.x = 0.0; + an_box.y = 0.0; + an_box.w = anchors[2 * an_idx] / static_cast(input_size); + an_box.h = anchors[2 * an_idx + 1] / static_cast(input_size); + float iou = CalcBoxIoU(an_box, gt_shift); + if (iou > best_iou) { + best_iou = iou; + best_n = an_idx; + } + } + + int mask_idx = GetMaskIndex(anchor_mask, best_n); + gt_match_mask_data[i * b + t] = mask_idx; + if (mask_idx >= 0) { + int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 0); + CalcBoxLocationLoss(loss_data + i, input_data, gt, anchors, best_n, + box_idx, gi, gj, h, input_size, stride); + + int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; + obj_mask_data[obj_idx] = 1.0; + + int label = gt_label_data[i * b + t]; + int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 5); + CalcLabelLoss(loss_data + i, input_data, label_idx, label, + class_num, stride); + } + } + } + + CalcObjnessLoss(loss_data, input_data + 4 * stride, obj_mask_data, n, + mask_num, h, w, stride, an_stride); + } +}; + +template +class Yolov3LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); + auto* objness_mask = ctx.Input("ObjectnessMask"); + auto* gt_match_mask = ctx.Input("GTMatchMask"); + auto anchors = ctx.Attr>("anchors"); + auto anchor_mask = ctx.Attr>("anchor_mask"); + int class_num = ctx.Attr("class_num"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input_grad->dims()[0]; + const int c = input_grad->dims()[1]; + const int h = input_grad->dims()[2]; + const int w = input_grad->dims()[3]; + const int mask_num = anchor_mask.size(); + const int b = gt_match_mask->dims()[1]; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + const T* input_data = input->data(); + const T* gt_box_data = gt_box->data(); + const int* gt_label_data = gt_label->data(); + const T* loss_grad_data = loss_grad->data(); + const T* obj_mask_data = objness_mask->data(); + const int* gt_match_mask_data = gt_match_mask->data(); + T* input_grad_data = + input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); + + for (int i = 0; i < n; i++) { + for (int t = 0; t < b; t++) { + int mask_idx = gt_match_mask_data[i * b + t]; + if (mask_idx >= 0) { + Box gt = GetGtBox(gt_box_data, i, b, t); + int gi = static_cast(gt.x * w); + int gj = static_cast(gt.y * h); + + int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 0); + CalcBoxLocationLossGrad( + input_grad_data, loss_grad_data[i], input_data, gt, anchors, + anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride); + + int label = gt_label_data[i * b + t]; + int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 5); + CalcLabelLossGrad(input_grad_data, loss_grad_data[i], input_data, + label_idx, label, class_num, stride); + } + } + } + + CalcObjnessLossGrad(input_grad_data + 4 * stride, loss_grad_data, + input_data + 4 * stride, obj_mask_data, n, mask_num, + h, w, stride, an_stride); + } +}; + +} // namespace operators +} // namespace paddle diff --git 
a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index cb492f999532fff3562050135c3a1abdcda06ad5..fc28fe818dc0bd2a8607118c015b6b5fd168fb43 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -20,7 +20,7 @@ if(WITH_GRPC) collective_client.cc collective_server.cc ${GRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory) + DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) @@ -32,15 +32,17 @@ else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc request_handler_impl.cc rpc_client.cc rpc_server.cc variable_response.cc collective_client.cc collective_server.cc ${BRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) + set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index 87bdb83503783b32720eb57bd303ad7eb4bc17a8..b8e63f42e2040730ac79c57651d86d9e3176fa01 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); framework::AsyncIO([=] { @@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "GetRPC"; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + const std::string method = kGetRPC; + VarHandlePtr var_h( + new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); framework::AsyncIO([=] { auto ch_ctx = ch_ptr->Pop(); @@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); req.set_trainer_id(trainer_id_); google::protobuf::Closure* done = 
brpc::NewCallback( @@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, platform::RecordRPCEvent record_event(method, p_ctx); - if (method_name == "GetMonomerVariable") { + if (method_name == kGetMonomerRPC) { ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else if (method_name == kGetNoBarrierRPC) { + ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); } else { ch_ctx->stub->GetVariable(cntl, &req, response, done); } @@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, return var_h; } +VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, + kGetNoBarrierRPC, time_out); +} + VarHandlePtr BRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, + time_out); } VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const std::string& var_name, int64_t time_out) { - return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); + return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); } VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, + time_out); } VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, @@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr var_h( new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); @@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, time_out); } @@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; // var handle VarHandlePtr var_h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); @@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); + return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); } void BRPCClient::SendComplete() { @@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( 
google::protobuf::Closure* done = brpc::NewCallback( &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - if (method_name == "CheckPointNotifyRPC") { + if (method_name == kCheckPointNotifyRPC) { ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == "GetMonomerBarrier") { + } else if (method_name == kSendMonomerFetchBarrierRPC) { ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); } else { ch_ctx->stub->SendVariable(cntl, &req, response, done); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h index 2066ade8a5621f2c201b76690421a943db44535e..501a593b11d35c160348e42ee47216a85647aac4 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -65,6 +65,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetMonomerBarrier( @@ -76,6 +77,13 @@ class BRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline); + VarHandlePtr AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, @@ -103,6 +111,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out = FLAGS_rpc_deadline); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc index cbe0bd09c7b272c35b78818aa9e26feeb5497779..fea9b09414638b607ca7f7d558ce14a2d5bfa03d 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService { rpc_server_->GetThreadNum(distributed::kRequestGet))); } + it = rpc_call_map.find(distributed::kRequestGetNoBarrier); + if (it != rpc_call_map.end()) { + request_getnobarrier_h_ = it->second; + getnobarrier_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); + } + it = rpc_call_map.find(distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; @@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService { [=] { _GetVariable(cntl_butil, request, response, done); }); } + void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + getnobarrier_threads_->Run( + [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); + } + void _GetVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) { @@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService { brpc::Controller* cntl = static_cast(cntl_butil); std::string varname = request->varname(); + std::string out_varname = 
request->out_varname(); VLOG(3) << "RequestGet varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << request->trainer_id() << ", from:" << cntl->remote_side(); auto scope = request_get_h_->scope(); - auto invar = scope->FindVar(varname); + paddle::framework::Variable* invar = nullptr; + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + distributed::SerializeToIOBuf(out_varname, outvar, + *request_get_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); + } + } + + void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr, + "RequestGetNoBarrier handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + std::string out_varname = request->out_varname(); int trainer_id = request->trainer_id(); + + VLOG(3) << "RequestGetNoBarrier varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id + << ", from:" << cntl->remote_side(); + + auto scope = request_getnobarrier_h_->scope(); + paddle::framework::Variable* invar = nullptr; paddle::framework::Variable* outvar = nullptr; - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); + request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), - response, &cntl->response_attachment(), "", - false); + distributed::SerializeToIOBuf( + out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); } } + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, @@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService { private: distributed::RequestHandler* request_send_h_{nullptr}; distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_getnobarrier_h_{nullptr}; distributed::RequestHandler* request_prefetch_h_{nullptr}; distributed::RequestHandler* request_checkpoint_h_{nullptr}; distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; @@ -289,9 +341,10 @@ class BRPCServiceImpl : public SendRecvService { distributed::RPCServer* rpc_server_{nullptr}; - // FIXME(gongwb): brpc should support process one rpce use one threadpool. + // FIXME(gongwb): brpc should support process one rpc use one threadpool. 
std::unique_ptr send_threads_; std::unique_ptr get_threads_; + std::unique_ptr getnobarrier_threads_; std::unique_ptr prefetch_threads_; std::unique_ptr checkpoint_notify_threads_; }; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc index 7875c16c3cf412ee06fa7c8eb36400b1096f156b..52310f8d04db6a5df9967c0a5ec9a5e95a24cdab 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -74,7 +74,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); SendProcessor* s = new SendProcessor(ch); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); @@ -107,7 +107,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, void ProcGetResponse(const VarHandle& var_h, const ::grpc::ByteBuffer& ret_msg) { - VLOG(100) << "ProcGetResponse"; + VLOG(4) << "ProcGetResponse"; framework::Variable* outvar = nullptr; // get response's trainer_id is not used int trainer_id; @@ -127,59 +127,74 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, "/sendrecv.SendRecvService/GetVariable", time_out); } +VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar( + ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, + "/sendrecv.SendRecvService/GetVariableNoBarrier", time_out); +} + VarHandlePtr GRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, "/sendrecv.SendRecvService/GetMonomerVariable", time_out); } -VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& rpc_path, - int64_t time_out) { +VarHandlePtr GRPCClient::_AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_varname; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "GetRPC"; - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, s, method, p_ctx, h, 
rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_trainer_id(trainer_id_); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); + framework::AsyncIO( + [var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - // stub context - s->response_call_back_ = ProcGetResponse; + // stub context + s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method, p_ctx); - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); req_count_++; @@ -202,7 +217,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); @@ -242,7 +257,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "BatchBarrierRPC"; + const std::string method = kBatchBarrierRPC; VarHandlePtr h( new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -267,7 +282,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; VarHandlePtr h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -293,7 +308,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendMonomerFetchBarrierRPC"; + const std::string method = kSendMonomerFetchBarrierRPC; VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); s->Prepare(h, time_out); @@ -320,7 +335,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendCompleteRPC"; + const std::string method = kSendCompleteRPC; VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -347,7 +362,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - const std::string method = "CheckPointNotifyRPC"; + const std::string method = kCheckPointNotifyRPC; 
VarHandlePtr h( new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h index fa77d21257647b23b8ac9f8161a216d36d7df773..ce0d2152aa27c62b6e12881aaf2ae458597e67e6 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -186,8 +186,15 @@ class GRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, @@ -228,11 +235,11 @@ class GRPCClient : public RPCClient { void Proceed(); std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, const std::string& rpc, - int64_t time_out); + VarHandlePtr _AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline); private: grpc::CompletionQueue cq_; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 08f777e279e34da0c0ac89afd3f660fa089599fe..4a9c158cb0ab7f2d6fecbba9f957ae6ef153074c 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -136,17 +136,65 @@ class RequestGet final : public RequestBase { void Process() override { // proc request. 
std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); int trainer_id = request_.trainer_id(); - VLOG(4) << "RequestGet " << varname; + + VLOG(4) << "RequestGet " << out_varname << " from " << varname; auto scope = request_handler_->scope(); - auto invar = scope->FindVar(varname); + framework::Variable* invar = nullptr; framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); if (outvar) { - SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), + &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; +}; + +class RequestGetNoBarrier final : public RequestBase { + public: + explicit RequestGetNoBarrier(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetNoBarrier() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); + int trainer_id = request_.trainer_id(); + + VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; + + auto scope = request_handler_->scope(); + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), &reply_); } Finish(reply_, &responder_); @@ -460,6 +508,9 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestSend(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGet) { b = new RequestGet(&service_, cq.get(), handler, req_id); + + } else if (rpc_name == kRequestGetNoBarrier) { + b = new RequestGetNoBarrier(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGetMonomerVariable) { b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id, this); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h index 0b5c5151e637f0d7aeafaefefb01006ffe0f05c8..2965fe4490bedd0253682f0aef44e096232fc2fc 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -81,6 +81,7 @@ enum class GrpcMethod { kGetVariable, kPrefetchVariable, kCheckpointNotify, + kGetVariableNoBarrier, kGetMonomerVariable, kGetMonomerBarrier, }; @@ -94,6 +95,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kGetVariableNoBarrier: + return "/sendrecv.SendRecvService/GetVariableNoBarrier"; case GrpcMethod::kGetMonomerVariable: return "/sendrecv.SendRecvService/GetMonomerVariable"; case 
GrpcMethod::kGetMonomerBarrier: diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index 27ca1f4edc04f5fca54b1a6340243634a596939c..e9f06f54327875c0568c571627e9effb998e15be 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -85,7 +85,7 @@ class ProtoEncodeHelper { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised if (paddle::platform::is_error(p_ <= limit_)) { - paddle::platform::throw_on_error(p_ <= limit_); + paddle::platform::throw_on_error(p_ <= limit_, ""); } #undef REPLACE_ENFORCE_GLOG } diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 62b24f150b41efead24c8bdbe08c9b44e160445a..991158ac72007efc1233f852caed4f90f35fe1cd 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -42,11 +42,24 @@ constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; +constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; + +constexpr char kSendRPC[] = "SendRPC"; +constexpr char kGetRPC[] = "GetRPC"; +constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; +constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; +constexpr char kPrefetchRPC[] = "PrefetchRPC"; +constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; +constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; +constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; +constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; +constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV" +#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 9722f8c96e91d2dfbe929dcc11645a40c44afb4e..a1c5c0777402b808eed6306862fd6dd41b529dbd 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/string/piece.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -53,6 +54,11 @@ bool RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { VLOG(3) << "async process var: " << varname; + if (varname == BATCH_BARRIER_MESSAGE) { + PADDLE_THROW( + "async mode should not recv BATCH_BARRIER_MESSAGE or " + "COMPLETE_MESSAGE"); + } try { executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); @@ -81,7 +87,8 @@ bool RequestGetHandler::Handle(const std::string& varname, const int trainer_id, const std::string& out_var_name, const std::string& table_name) { - VLOG(4) << "RequestGetHandler:" << varname; + VLOG(4) << 
"RequestGetHandler:" << varname + << " out_var_name: " << out_var_name; if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { @@ -112,6 +119,32 @@ bool RequestGetHandler::Handle(const std::string& varname, return true; } +bool RequestGetNoBarrierHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestGetNoBarrierHandler:" << varname + << " out_var_name: " << out_var_name; + + // get var from pserver immediately without barriers + string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); + string::Piece var_name_piece = string::Piece(varname); + + if (string::Contains(var_name_piece, without_barrier_piece)) { + var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); + VLOG(4) << "Get var " << var_name_piece << " with " + << WITHOUT_BARRIER_MESSAGE; + *outvar = scope_->FindVar(var_name_piece.ToString()); + return true; + } else { + PADDLE_THROW("GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE); + } + return true; +} + bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 5e0b25c5c2ce161dee0948a07baab32dfff9be6f..f3c1b24526b8b28033c0c979f74d44a3d7a94201 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -67,6 +67,16 @@ class RequestGetHandler final : public RequestHandler { bool enable_dc_asgd_; }; +class RequestGetNoBarrierHandler final : public RequestHandler { + public: + RequestGetNoBarrierHandler() : RequestHandler(false) {} + virtual ~RequestGetNoBarrierHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; +}; + static inline void BuildVar(const std::string& param_name, std::initializer_list arguments, paddle::framework::proto::OpDesc::Var* var) { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index b668d869787a47ebd36f570061421ddbeae5a09a..ea54e0c2951253fc009672f4cd2e5233ed56944e 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -43,8 +43,15 @@ class RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index cc5b9c29a12ec5386041dfeea22fd388d94115e6..c3a46e348c69a20953f013c7de772a37db5f4844 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -39,27 
+39,33 @@ void RPCServer::SavePort() const { port_file.open(file_path); port_file << selected_port_; port_file.close(); - VLOG(4) << "selected port written to " << file_path; + VLOG(3) << "selected port written to " << file_path; } void RPCServer::WaitBarrier(const std::string& rpc_name) { + VLOG(3) << "WaitBarrier in: " << rpc_name; std::unique_lock lock(this->mutex_); barrier_cond_.wait(lock, [this, &rpc_name] { return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || exit_flag_.load()); }); - VLOG(3) << "batch_barrier_: " << rpc_name << " " - << barrier_counter_[rpc_name]; + VLOG(3) << "WaitBarrier out: " << rpc_name + << " counter: " << barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + // barrier msg should make sure that it's in the right cond(send|recv) + WaitCond(rpc_name); int b = 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; + VLOG(3) << rpc_name << " barrier_counter: " << b; if (b >= client_num_) { lock.unlock(); + VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " + << rpc_name; barrier_cond_.notify_all(); lock.lock(); } @@ -71,7 +77,7 @@ void RPCServer::Complete() { client_num_--; need_reset_all_vars_ = true; - VLOG(4) << "decrease client_num to: " << client_num_; + VLOG(3) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; } @@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, static int cond = -1; rpc_cond_map_[rpc_name] = ++cond; - VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler - << ", cond:" << rpc_cond_map_[rpc_name]; + VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler + << ", cond: " << rpc_cond_map_[rpc_name]; } void RPCServer::SetCond(const std::string& rpc_name) { @@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(4) << "RPCServer WaitCond " << rpc_name; + VLOG(3) << "RPCServer WaitCond in " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); @@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) { std::unique_lock lock(mutex_); rpc_cond_.wait( lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); + VLOG(3) << "RPCServer WaitCond out " << rpc_name; } void RPCServer::RegisterVar(const std::string& var_name, @@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name, } rpc_cond_.notify_all(); - VLOG(4) << "RegisterVar context:" << h.String(); + VLOG(3) << "RegisterVar context:" << h.String(); } void RPCServer::IncreaseVarBarrier(const std::string& var_name) { @@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) { barrier_cond_.notify_all(); } - VLOG(4) << "IncreaseVarBarrier context:" << h.String(); + VLOG(3) << "IncreaseVarBarrier context:" << h.String(); } void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(4) << "WaitBarrier var_name:" << var_name; + VLOG(3) << "WaitVarBarrier var_name:" << var_name; std::unique_lock lock(mutex_); barrier_cond_.wait(lock, [&]() { @@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) { exit_flag_.load()); }); - VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); + VLOG(3) << "WaitVarBarrier context: " << 
var_map_[var_name].String(); } void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(4) << "SetVarCond var_name:" << var_name; + VLOG(3) << "SetVarCond var_name:" << var_name; { std::unique_lock lock(mutex_); if (var_map_.find(var_name) != var_map_.end()) { @@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) { } void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(4) << "WaitVarCond var_name:" << var_name; + VLOG(3) << "WaitVarCond var_name:" << var_name; std::unique_lock lock(mutex_); rpc_cond_.wait(lock, [=] { return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); }); - VLOG(4) << "WaitVarCond var_name:" << var_name << " end"; + VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; } MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index b39eef04d8d1de77cb951f90a10e69eebb495282..6303667884361be050ac62c604274c87caa72444 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -17,8 +17,14 @@ package sendrecv; option cc_generic_services = @cc_generic_services@; service SendRecvService { + // For parameter server round-robin like hashing, do not split tensors. + // Send and recv only one tensor + // TODO(typhoonzero): add streaming API rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} + rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} + // pre-fetch variable by given variable name and Ids rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} @@ -27,12 +33,17 @@ service SendRecvService { rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} } +// It can be: LoDTensor、SelectedRows or NCCL_ID enum VarType { LOD_TENSOR = 0; SELECTED_ROWS = 1; NCCL_ID = 2; } +// VariableMessage is serialized paddle variable message. +// NOTICE(gongwb):don't modify this proto if you are not +// not familar with how we serialize in sendrecvop_utils.h +// and deserilize it in variable_response.h. message VariableMessage { enum Type { // Pod Types @@ -49,14 +60,21 @@ message VariableMessage { string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; + // bool persistable is not needed for sending. + // tensor info: Type data_type = 3; repeated int64 dims = 4; + // lod details: int64 lod_level = 5; repeated LodData lod = 6; + // selected_rows height, aka. original dim0 int64 slr_height = 7; + // tensor data bytes serialized = 8; + // selected_rows data bytes rows = 9; + // Look up table block execution output variable name. 
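Returning to the RPCServer changes above: IncreaseBatchBarrier now first waits for the matching condition (WaitCond) and then bumps a per-RPC counter, releasing WaitBarrier once every client has checked in. The counting pattern, stripped of the exit_flag_ and condition bookkeeping of the real class, looks like this standalone sketch:

#include <condition_variable>
#include <mutex>
#include <string>
#include <unordered_map>

// Minimal illustration of the batch-barrier bookkeeping: every trainer bumps a
// per-RPC counter; waiters are released once the counter reaches client_num_.
class BarrierMonitor {
 public:
  explicit BarrierMonitor(int client_num) : client_num_(client_num) {}

  void Increase(const std::string& rpc_name) {
    std::unique_lock<std::mutex> lock(mutex_);
    if (++counter_[rpc_name] >= client_num_) {
      cv_.notify_all();
    }
  }

  void Wait(const std::string& rpc_name) {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [&] { return counter_[rpc_name] >= client_num_; });
  }

  void Reset(const std::string& rpc_name) {
    std::lock_guard<std::mutex> lock(mutex_);
    counter_[rpc_name] = 0;
  }

 private:
  int client_num_;
  std::mutex mutex_;
  std::condition_variable cv_;
  std::unordered_map<std::string, int> counter_;
};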
string out_varname = 10; // If 1, the ps server will start profiling, the ps // server stops profiling and generates a profile to /tmp/profile_ps_* diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 47ff568a1135f2f0a146faa4d5d6fc422a344f51..7825b4fc82b1f7580fea8ab4961facaf7fd64397 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData( tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type())); VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() - << ", Buffer Size = " << length; - PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast(length)); + << ", Buffer Size = " << length << ", dims:" << dims + << ", numel:" << tensor->numel(); + PADDLE_ENFORCE_GE(tensor->memory_size(), static_cast(length)); return ReadRaw(input, ctx, tensor->place(), tensor_data, length); } diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 629f364d712694d2bc1d2483711f5247561ed1da..5b30ed472d51a37a0705d1717395da9e4ff7d743 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop( while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. + VLOG(3) << "wait all clients to send gradient"; rpc_service_->SetCond(distributed::kRequestSend); + VLOG(3) << "wait all clients to send send_barrier"; rpc_service_->WaitBarrier(distributed::kRequestSend); if (rpc_service_->IsExit()) { @@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop( } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "ResetReceivedVars"; ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); + VLOG(3) << "wait all clients to get parameters back"; rpc_service_->SetCond(distributed::kRequestGet); + VLOG(3) << "wait all clients to send fetch_barrier"; rpc_service_->WaitBarrier(distributed::kRequestGet); + VLOG(3) << "ResetBarrierCounter"; rpc_service_->ResetBarrierCounter(); } // while(true) } @@ -347,6 +353,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, new distributed::RequestPrefetchHandler(sync_mode)); request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler( sync_mode, checkpoint_block_id)); + request_get_no_barrier_handler_.reset( + new distributed::RequestGetNoBarrierHandler()); rpc_service_->RegisterRPC(distributed::kRequestSend, request_send_handler_.get(), @@ -359,6 +367,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, FLAGS_rpc_prefetch_thread_num); rpc_service_->RegisterRPC(distributed::kRequestCheckpoint, request_checkpoint_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestGetNoBarrier, + request_get_no_barrier_handler_.get()); auto optimize_blocks = Attr>(kOptimizeBlocks); @@ -413,6 +423,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, f(request_get_handler_.get()); f(request_prefetch_handler_.get()); f(request_checkpoint_handler_.get()); + f(request_get_no_barrier_handler_.get()); // start the 
server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h index 9431978df836121baacc12ed4e1ee6b218cc7d7a..f20442bad7c5bd96173b9d6efc4dceb13feacf5b 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h @@ -55,7 +55,6 @@ class ListenAndServOp : public framework::OperatorBase { const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs); - virtual ~ListenAndServOp(); void RunSyncLoop(framework::Executor* executor, @@ -89,6 +88,8 @@ class ListenAndServOp : public framework::OperatorBase { mutable std::shared_ptr rpc_service_; mutable std::shared_ptr request_send_handler_; mutable std::shared_ptr request_get_handler_; + mutable std::shared_ptr + request_get_no_barrier_handler_; mutable std::shared_ptr request_prefetch_handler_; mutable std::shared_ptr diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.h b/paddle/fluid/operators/distributed_ops/merge_ids_op.h index 99c57590191d58a12760fb335df76037685d1ced..05c00251b97bb5071102a43208c1fbbfa4ef8d2d 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.h @@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ids.size(), outs.size(), "the number of Ids and Out should be the same"); - size_t row_ids_size = 0; - int row_size = 0; - int embedding_size = 0; + int64_t row_ids_size = 0; + int64_t row_size = 0; + int64_t embedding_size = 0; for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *x_tensor = x_tensors[i]; @@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel { for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *row_id = row_ids[i]; - for (int j = 0; j < row_id->numel(); ++j) { + for (auto j = 0; j < row_id->numel(); ++j) { int64_t key = row_id->data()[j]; std::tuple val = std::make_tuple(i, j); selected_rows_idx_map.insert(std::make_pair(key, val)); @@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel { out->set_lod(out_ids->lod()); - int nums = static_cast(out_ids->dims()[0]); + auto nums = out_ids->dims()[0]; auto *out_data = out->mutable_data( framework::make_ddim({nums, embedding_size}), place); - for (int j = 0; j < nums; ++j) { - int id = out_ids->data()[j]; - auto row_tuple = selected_rows_idx_map[id]; - int64_t row_idx = std::get<1>(row_tuple); + for (auto j = 0; j < nums; ++j) { + auto id = out_ids->data()[j]; + auto row_tuple = selected_rows_idx_map.at(id); + auto row_idx = std::get<1>(row_tuple); const auto *x_tensor = x_tensors[std::get<0>(row_tuple)]; memcpy(out_data + embedding_size * j, diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 48065437e38b2c5457c135cce03075f81110a329..120c65f29699bf2745b09ea312d1de069c8173c5 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -27,30 +27,50 @@ namespace operators { class RecvOp : public framework::OperatorBase { public: - RecvOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + RecvOp(const std::string &type, const framework::VariableNameMap &inputs, + 
const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - auto outs = Outputs("Out"); + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { std::vector epmap = Attr>("epmap"); + std::vector varnames = + Attr>("varnames"); int sync_mode = Attr("sync_mode"); + auto outs = Outputs("Out"); + bool with_barrier = Attr("with_barrier"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); - distributed::RPCClient* rpc_client = + distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance( Attr("trainer_id")); - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); - } - if (sync_mode) { + if (with_barrier) { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVar"; + rets.push_back( + rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); + } + if (sync_mode) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + } else { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVarNoBarrier"; + rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, + varname, outs[i])); + } for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } @@ -79,12 +99,23 @@ This operator can get variables from server side. "(int, default 0)" "sync recv or async recv.") .SetDefault(0); + AddAttr("with_barrier", + "(bool, default True) if with_barrier=False, will use " + "AsyncGetVarNoBarrier get variable from pserver immediately") + .SetDefault(true); + AddAttr>( + "varnames", + "(string vector, default {}) " + "sometimes we need to put received var in another name " + "for example: we need var named 'moment_1@127.0.0.1:1001', " + "and it real name on parameter server is 'moment_1'. 
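Taken together, the recv op now decides two things per output: which remote name to request (the optional varnames attribute, falling back to the output name) and whether to go through AsyncGetVar (with barrier) or AsyncGetVarNoBarrier. A minimal sketch of the name resolution (the helper name is illustrative):

#include <string>
#include <vector>

// If "varnames" is empty, the local output name doubles as the remote name;
// otherwise varnames[i] is fetched from the pserver and stored into outs[i].
std::string RemoteName(const std::vector<std::string>& varnames,
                       const std::vector<std::string>& outs, size_t i) {
  return varnames.empty() ? outs[i] : varnames[i];
}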
") + .SetDefault({}); } }; class RecvOpShapeInference : public framework::InferShapeBase { public: - void operator()(framework::InferShapeContext* ctx) const override {} + void operator()(framework::InferShapeContext *ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index d65491267de1ce3495d8b8250cf0cff570dfcc6a..7a6927d3e54b4ece8f17d7a1e7e431ba836edff9 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -114,4 +114,5 @@ REGISTER_OP_CUDA_KERNEL( ops::GPUDropoutKernel); REGISTER_OP_CUDA_KERNEL( dropout_grad, ops::DropoutGradKernel, + ops::DropoutGradKernel, ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 7bb6934e1496cc989eee8ba82f56959522803bfb..cb8a4e7e1502e7e6ceb48e51452c2c7ab8313972 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -277,68 +277,6 @@ class TransformFunctor { Functor func_; }; -#define EIGEN_FUNCTOR(name, eigen_op) \ - struct Eigen##name##Functor { \ - template \ - inline void Run(const framework::Tensor *x, const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_e); \ - } \ - template \ - inline void RunBroadCast(const framework::Tensor *x, \ - const framework::Tensor *y, framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ - .broadcast(Eigen::DSizes(pre, 1)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - template \ - inline void RunBroadCast2(const framework::Tensor *x, \ - const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n, int post) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ - .broadcast(Eigen::DSizes(pre, 1, post)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - } - -#define EIGEN_ADD(x, y) ((x) + (y)) - -EIGEN_FUNCTOR(Add, EIGEN_ADD); - -#define EIGEN_SUB(x, y) ((x) - (y)) - -EIGEN_FUNCTOR(Sub, EIGEN_SUB); - -#define EIGEN_MUL(x, y) ((x) * (y)) - -EIGEN_FUNCTOR(Mul, EIGEN_MUL); - -#define EIGEN_DIV(x, y) ((x) / (y)) - -EIGEN_FUNCTOR(Div, EIGEN_DIV); - template struct ElemwiseGradNoBroadcast { const T *x_; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/elementwise/elementwise_add_mkldnn_op.cc rename to paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc diff --git 
a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc rename to paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index c72a966c575d4a63471905b82643e96454f08187..6e13887866485bd114ebf12f4bdfa8d60fca6d01 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -216,19 +216,18 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { out_datas.push_back( static_cast(output_data + (oc0 + oc1 + oc2) * h * w)); - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - for (int i = 0; i < 4; ++i) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], - static_cast(filters[i]->data()), conv_desc[i], - algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, out_desc[i], - out_datas[i], bias_desc[i], - static_cast(bias[i]->data()), cudnn_act_desc, - out_desc[i], out_datas[i])); + auto func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], + static_cast(filters[i]->data()), conv_desc[i], + algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, + out_desc[i], out_datas[i], bias_desc[i], + static_cast(bias[i]->data()), cudnn_act_desc, + out_desc[i], out_datas[i])); + }; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + workspace_handle.RunFunc(func, workspace_size_in_bytes); } cudnnTensorDescriptor_t x_desc; diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index a35ee8a09ed5ddcc4ac465d200b84358fa65b2f3..e9e2a3b1f5c1c00bb2e95b6171ecd09bfe7a0d21 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() { } template -static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n, - int k) { +static void fc_relu(const T* x, const T* w, const T* b, T* y, + const jit::matmul_attr_t& attr) { auto matmul = - jit::Get, platform::CPUPlace>(k); + jit::Get, platform::CPUPlace>(attr); auto addbias_relu = - jit::Get, platform::CPUPlace>(n); - matmul(x, w, y, m, n, k); + jit::Get, platform::CPUPlace>(attr.n); + matmul(x, w, y, &attr); T* dst = y; - for (int i = 0; i < m; ++i) { - addbias_relu(b, dst, dst, n); - dst += n; + for (int i = 0; i < attr.m; ++i) { + addbias_relu(b, dst, dst, attr.n); + dst += attr.n; } } @@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { auto i_dims = in->dims(); auto w_dims = weights[0]->dims(); - int m = i_dims[0]; - int n = w_dims[1]; - int k = w_dims[0]; - relus[0]->Resize({m, n}); + jit::matmul_attr_t attr; + attr.m = i_dims[0]; + attr.n = w_dims[1]; + attr.k = w_dims[0]; + relus[0]->Resize({attr.m, attr.n}); fc_relu(in->data(), weights[0]->data(), biases[0]->data(), - relus[0]->mutable_data(place), m, n, k); + relus[0]->mutable_data(place), attr); for (int i = 1; i < 
weight_sz - 1; ++i) { auto i_dims = relus[i - 1]->dims(); auto w_dims = weights[i]->dims(); - int m = i_dims[0]; - int n = w_dims[1]; - int k = w_dims[0]; - relus[i]->Resize({m, n}); + attr.m = i_dims[0]; + attr.n = w_dims[1]; + attr.k = w_dims[0]; + relus[i]->Resize({attr.m, attr.n}); fc_relu(relus[i - 1]->data(), weights[i]->data(), - biases[i]->data(), relus[i]->mutable_data(place), m, n, k); + biases[i]->data(), relus[i]->mutable_data(place), attr); } auto i_dims_last = relus[weight_sz - 2]->dims(); auto w_dims_last = weights[weight_sz - 1]->dims(); - m = i_dims_last[0]; - n = w_dims_last[1]; - k = w_dims_last[0]; + attr.m = i_dims_last[0]; + attr.n = w_dims_last[1]; + attr.k = w_dims_last[0]; fc_relu(relus[weight_sz - 2]->data(), weights[weight_sz - 1]->data(), - biases[weight_sz - 1]->data(), out->mutable_data(place), m, n, - k); + biases[weight_sz - 1]->data(), out->mutable_data(place), + attr); } }; diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 00dafdead53bbd4614c70875441c565724fca46d..8c8b079633aacb711aa304ec7016c37c6bec61ce 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -87,15 +87,18 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims = y->dims(); - int m = x_dims[0]; - int k = x_dims[1]; - int n = y_dims[1]; - int o_numel = m * n; + jit::matmul_attr_t attr; + attr.m = x_dims[0]; + attr.k = x_dims[1]; + attr.n = y_dims[1]; + int o_numel = attr.m * attr.n; auto vsquare_x = - jit::Get, platform::CPUPlace>(m * k); + jit::Get, platform::CPUPlace>(attr.m * + attr.k); auto vsquare_y = - jit::Get, platform::CPUPlace>(k * n); + jit::Get, platform::CPUPlace>(attr.k * + attr.n); auto vsquare_xy = jit::Get, platform::CPUPlace>(o_numel); auto vsub = @@ -103,7 +106,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { auto vscal = jit::Get, platform::CPUPlace>(o_numel); auto matmul = - jit::Get, platform::CPUPlace>(k); + jit::Get, platform::CPUPlace>(attr); const T* x_data = x->data(); const T* y_data = y->data(); @@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { T* squared_xy_data = squared_xy->mutable_data(place); T* o_data = out->mutable_data(place); - matmul(x_data, y_data, squared_xy_data, m, n, k); + matmul(x_data, y_data, squared_xy_data, &attr); vsquare_xy(squared_xy_data, squared_xy_data, o_numel); - vsquare_x(x_data, squared_x_data, m * k); - vsquare_y(y_data, squared_y_data, k * n); - matmul(squared_x_data, squared_y_data, o_data, m, n, k); + vsquare_x(x_data, squared_x_data, attr.m * attr.k); + vsquare_y(y_data, squared_y_data, attr.k * attr.n); + matmul(squared_x_data, squared_y_data, o_data, &attr); vsub(squared_xy_data, o_data, o_data, o_numel); vscal(&scalar, o_data, o_data, o_numel); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 0a8c0814a7d472bb1b527a4df470a34dcaf00e81..55cef93aacd43174edefbb8aa740bcbea3d8feef 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -103,8 +103,10 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, + ops::GatherOpKernel, ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, 
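The fused kernels above now hand the jit matmul a single matmul_attr_t instead of loose m/n/k arguments. As a reminder of the contract that attribute describes, here is a scalar reference of the product (the struct below is a local stand-in, not jit::matmul_attr_t itself):

// C(m x n) = A(m x k) * B(k x n), row-major, the product the jit kMatMul
// kernel computes for a given {m, n, k}.
struct MatMulAttr { int m, n, k; };  // stand-in for jit::matmul_attr_t

void RefMatMul(const float* a, const float* b, float* c, const MatMulAttr& attr) {
  for (int i = 0; i < attr.m; ++i) {
    for (int j = 0; j < attr.n; ++j) {
      float sum = 0.f;
      for (int p = 0; p < attr.k; ++p) {
        sum += a[i * attr.k + p] * b[p * attr.n + j];
      }
      c[i * attr.n + j] = sum;
    }
  }
}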
ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel, ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 9f4aef08cd58e72ce344a640e6564b9e360ce169..490ba9a585ee8fac82a9e1178f506a6d39e5fd1c 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -31,7 +31,7 @@ class GatherOpCUDAKernel : public framework::OpKernel { auto *output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); - + if (x->numel() == 0) return; GPUGather(ctx.device_context(), *x, *index, output); } }; @@ -45,14 +45,13 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { auto *Index = ctx.Input("Index"); auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); - auto *x = ctx.Input("X"); dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); auto &place = *ctx.template device_context() .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); - + if (dO->numel() == 0) return; GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); } }; @@ -61,11 +60,14 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 2dd726bebb1bc2e4d83844c0b98df01c390e622f..2e18298cf8e34d5f70369c89b3b3b2a9ced0ce62 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -35,7 +35,7 @@ class GatherOpKernel : public framework::OpKernel { auto *output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); - + if (x->numel() == 0) return; CPUGather(ctx.device_context(), *x, *index, output); } }; @@ -56,7 +56,7 @@ class GatherGradientOpKernel : public framework::OpKernel { auto &place = *ctx.template device_context() .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); - + if (dO->numel() == 0) return; ScatterAssign(ctx.device_context(), *dO, *Index, dX); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 14a2524bd8f4a9f7685c84f1d9767f5f7eedf0e7..241184c6f4a19a1da0d6d75c5d4e2b372c14e9da 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], "Input(X) and Input(Grid) dims[0] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[1], x_dims[2], - "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[2], x_dims[3], - "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + grid_dims[1], x_dims[2], + "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); + PADDLE_ENFORCE_EQ( + grid_dims[2], x_dims[3], + "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + } 
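The gather kernels above gained an early return for empty inputs, and the grid_sampler change in the same hunk follows a related principle: cross-tensor dimension checks are deferred to runtime because a dimension can still be -1 at compile time. A scalar reference of the gather forward pass with that guard (names and layout here are illustrative, not the real CPUGather signature):

#include <cstdint>
#include <cstring>

// x holds x_rows rows of slice_bytes bytes each; out receives index_size rows.
void GatherRows(const char* x, int64_t x_rows, size_t slice_bytes,
                const int64_t* index, int64_t index_size, char* out) {
  if (x_rows == 0) return;  // mirrors the `numel() == 0` guard in the kernels
  for (int64_t i = 0; i < index_size; ++i) {
    std::memcpy(out + i * slice_bytes, x + index[i] * slice_bytes, slice_bytes);
  }
}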
ctx->SetOutputDim("Output", x_dims); ctx->ShareLoD("X", "Output"); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 1a7ca963010112bbcab69f1ceeb9cb8d19ca9b9e..4d5a84bcafed1ab0739349e1dbc7b5a9f9ad64ec 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -136,7 +136,7 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { sum.mutable_data(framework::make_ddim(sum_dims), ctx.GetPlace()); auto sum_mat = EigenMatrix::From(sum); out->mutable_data(ctx.GetPlace()); - auto out_mat = framework::EigenVector::Flatten(*out); + auto out_mat = framework::EigenMatrix::From(*out); if (bias) { bit_code->Add(*bias, pre_out); } diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 93dd3f794f6087a3158fee1f262795871f21611a..de91ba6270ac2ed22c8380878c0a0037fb1629c0 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -82,6 +82,18 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { "bilinear interpolation and \"nearest\" for nearest " "neighbor interpolation.") .SetDefault("bilinear"); + AddAttr( + "align_corners", + "an optinal bool. Defaults to True. " + "If True, the centers of 4 corner pixels of the input and output " + "tensors are aligned, preserving the values at the corner pixels, " + "if Flase, are not aligned") + .SetDefault(true); + AddAttr("align_mode", + "(int, default \'1\'), optional for bilinear interpolation" + "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , " + "can be \'1\' for src_idx = scale*dst_index .") + .SetDefault(1); AddComment(R"DOC( This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" @@ -98,6 +110,64 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { to perform linear interpolation first in one direction, and then again in the other direction. + Align_corners and align_mode are optinal parameters,the calculation method + of interpolation can be selected by them. 
+ + Example: + + For scale: + + if align_corners = True and out_{size}>1 : + + scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0) + + else: + + scale_{factor} = float(in_{size}/out_{size}) + + + Nearest neighbor interpolation: + + if: + align_corners = False + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor + W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor + + else: + align_corners = True + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + Bilinear interpolation: + + if: + align_corners = False , align_mode = 0 + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + + + else: + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + + + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 99ac725f73bf60ab0fb9a467432e8a57c646ef35..b887878ea2291d6c56fec91738784e338606b84f 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -23,7 +23,8 @@ __global__ void KeNearestNeighborInterpFw( const T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w) { + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -35,10 +36,14 @@ __global__ void KeNearestNeighborInterpFw( int channel_id = out_id_w / out_img_size; int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = static_cast(ratio_h * out_img_idy + 0.5); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); int out_img_idx = tid % out_img_w; - int in_img_idx = static_cast(ratio_w * out_img_idx + 0.5); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); out[tid] = in[out_id_h * input_w + channel_id * in_img_size + in_img_idy * in_img_w + in_img_idx]; @@ -50,7 +55,8 @@ __global__ void KeNearestNeighborInterpBw( T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, const T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w) { + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -62,10 +68,14 @@ __global__ void KeNearestNeighborInterpBw( int channel_id = out_id_w / out_img_size; int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = static_cast(ratio_h * out_img_idy + 0.5); + int in_img_idy = (align_corners) + ? 
static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); int out_img_idx = tid % out_img_w; - int in_img_idx = static_cast(ratio_w * out_img_idx + 0.5); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idy * in_img_w + in_img_idx]; @@ -79,10 +89,12 @@ __global__ void KeBilinearInterpFw( const T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w) { + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners, const int align_mode) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); for (; tid < nthreads; tid += stride) { int out_id_h = tid / output_w; int out_id_w = tid % output_w; @@ -91,15 +103,23 @@ __global__ void KeBilinearInterpFw( int channel_id = out_id_w / out_img_size; int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = ratio_h * out_img_idy; + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h1lambda = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 - in_img_idy + : ratio_h * out_img_idy - in_img_idy; T h2lambda = 1.f - h1lambda; int out_img_idx = tid % out_img_w; - int in_img_idx = ratio_w * out_img_idx; + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T w1lambda = ratio_w * out_img_idx - in_img_idx; + T w1lambda = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 - in_img_idx + : ratio_w * out_img_idx - in_img_idx; T w2lambda = 1.f - w1lambda; const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + @@ -118,10 +138,12 @@ __global__ void KeBilinearInterpBw( T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, const T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const T ratio_h, const T ratio_w) { + const size_t num_channels, const T ratio_h, const T ratio_w, + const bool align_corners, const int align_mode) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); for (; tid < nthreads; tid += stride) { int out_id_h = tid / output_w; int out_id_w = tid % output_w; @@ -130,15 +152,22 @@ __global__ void KeBilinearInterpBw( int channel_id = out_id_w / out_img_size; int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = ratio_h * out_img_idy; + int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 + : ratio_h * out_img_idy; + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h1lambda = align_flag ? 
ratio_h * (out_img_idy + 0.5) - 0.5 - in_img_idy + : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; int out_img_idx = tid % out_img_w; - int in_img_idx = ratio_w * out_img_idx; + int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 + : ratio_w * out_img_idx; + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T w1lambda = ratio_w * out_img_idx - in_img_idx; + T w1lambda = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 - in_img_idx + : ratio_w * out_img_idx - in_img_idx; T w2lambda = 1.f - w1lambda; T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + @@ -175,6 +204,9 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { out_w = size_data[1]; } + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + int n = input->dims()[0]; int c = input->dims()[1]; int in_h = input->dims()[2]; @@ -188,10 +220,16 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { int in_chw = c * in_hw; int out_chw = c * out_hw; - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } if (in_h == out_h && in_w == out_w) { framework::TensorCopy(*input, ctx.GetPlace(), output); @@ -206,12 +244,12 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { KeNearestNeighborInterpFw< T><<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w); + out_chw, c, ratio_h, ratio_w, align_corners); } else if ("bilinear" == interp_method) { KeBilinearInterpFw< T><<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w); + out_chw, c, ratio_h, ratio_w, align_corners, align_mode); } } }; @@ -234,6 +272,10 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); auto out_size = ctx.Input("OutSize"); + + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + if (out_size != nullptr) { Tensor sizes; framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes); @@ -252,10 +294,16 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { int in_chw = c * in_hw; int out_chw = c * out_hw; - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } if (in_h == out_h && in_w == out_w) { framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); @@ -270,12 +318,12 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { KeNearestNeighborInterpBw< T><<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, - out_w, n, out_chw, c, ratio_h, ratio_w); + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners); } else if ("bilinear" == interp_method) { KeBilinearInterpBw< T><<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, - out_w, n, out_chw, c, ratio_h, ratio_w); + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode); } } }; diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 7fdb3e1f5a2ff82284d89dd0759e357978e1d873..c631ad1dd158ce114169602f073d69b2291b5b3b 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -26,14 +26,17 @@ template static void NearestNeighborInterpolate(const Tensor& input, Tensor* output, const float ratio_h, const float ratio_w, const int n, const int c, - const int out_h, const int out_w) { + const int out_h, const int out_w, + const bool align_corners) { auto input_t = EigenTensor::From(input); auto output_t = EigenTensor::From(*output); for (int k = 0; k < out_h; k++) { // loop for images - int in_k = static_cast(ratio_h * k + 0.5); + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); for (int l = 0; l < out_w; l++) { - int in_l = static_cast(ratio_w * l + 0.5); + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); for (int i = 0; i < n; i++) { // loop for batches for (int j = 0; j < c; j++) { // loop for channels @@ -48,20 +51,29 @@ template static void BilinearInterpolation(const Tensor& input, Tensor* output, const float ratio_h, const float ratio_w, const int in_h, const int in_w, const int n, - const int c, const int out_h, - const int out_w) { + const int c, const int out_h, const int out_w, + const bool align_corners, + const bool align_mode) { auto input_t = EigenTensor::From(input); auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); for (int k = 0; k < out_h; k++) { // loop for images - int y_n = static_cast(ratio_h * k); + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float d_n = ratio_h * k - y_n; + float d_n = + align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n; float d_s = 1.f - d_n; for (int l = 0; l < out_w; l++) { - int x_w = static_cast(ratio_w * l); + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float d_w = ratio_w * l - x_w; + float d_w = + align_flag ? 
ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w; float d_e = 1.f - d_w; for (int i = 0; i < n; i++) { // loop for batches @@ -78,19 +90,20 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output, } template -static void NearestNeighborInterpolateGrad(const Tensor& output_grad, - Tensor* input_grad, - const float ratio_h, - const float ratio_w, const int n, - const int c, const int out_h, - const int out_w) { +static void NearestNeighborInterpolateGrad( + const Tensor& output_grad, Tensor* input_grad, const float ratio_h, + const float ratio_w, const int n, const int c, const int out_h, + const int out_w, const bool align_corners) { auto input_grad_t = EigenTensor::From(*input_grad); auto output_grad_t = EigenTensor::From(output_grad); + for (int k = 0; k < out_h; k++) { // loop for images - int in_k = static_cast(ratio_h * k + 0.5); + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); for (int l = 0; l < out_w; l++) { - int in_l = static_cast(ratio_w * l + 0.5); + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); for (int i = 0; i < n; i++) { // loop for batches for (int j = 0; j < c; j++) { // loop for channels @@ -106,19 +119,28 @@ static void BilinearInterpolationGrad(const Tensor& output_grad, Tensor* input_grad, const float ratio_h, const float ratio_w, const int in_h, const int in_w, const int n, const int c, - const int out_h, const int out_w) { + const int out_h, const int out_w, + const bool align_corners, + const int align_mode) { auto input_grad_t = EigenTensor::From(*input_grad); auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); for (int k = 0; k < out_h; k++) { // loop for images - int y_n = static_cast(ratio_h * k); + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float d_n = ratio_h * k - y_n; + float d_n = + align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n; float d_s = 1.f - d_n; for (int l = 0; l < out_w; l++) { - int x_w = static_cast(ratio_w * l); + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float d_w = ratio_w * l - x_w; + float d_w = + align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w; float d_e = 1.f - d_w; for (int i = 0; i < n; i++) { // loop for batches @@ -134,7 +156,6 @@ static void BilinearInterpolationGrad(const Tensor& output_grad, } } } - template class InterpolateKernel : public framework::OpKernel { public: @@ -151,6 +172,8 @@ class InterpolateKernel : public framework::OpKernel { out_h = out_size_data[0]; out_w = out_size_data[1]; } + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); const int n = input->dims()[0]; const int c = input->dims()[1]; @@ -168,17 +191,24 @@ class InterpolateKernel : public framework::OpKernel { return; } - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } if ("bilinear" == interp_method) { BilinearInterpolation(*input, output, ratio_h, ratio_w, in_h, in_w, n, - c, out_h, out_w); + c, out_h, out_w, align_corners, align_mode); } else if ("nearest" == interp_method) { NearestNeighborInterpolate(*input, output, ratio_h, ratio_w, n, c, - out_h, out_w); + out_h, out_w, align_corners); } } }; @@ -200,6 +230,8 @@ class InterpolateGradKernel : public framework::OpKernel { out_h = out_size_data[0]; out_w = out_size_data[1]; } + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); const int n = input->dims()[0]; const int c = input->dims()[1]; @@ -217,17 +249,26 @@ class InterpolateGradKernel : public framework::OpKernel { return; } - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } if ("bilinear" == interp_method) { BilinearInterpolationGrad(*output_grad, input_grad, ratio_h, ratio_w, - in_h, in_w, n, c, out_h, out_w); + in_h, in_w, n, c, out_h, out_w, + align_corners, align_mode); } else if ("nearest" == interp_method) { NearestNeighborInterpolateGrad(*output_grad, input_grad, ratio_h, - ratio_w, n, c, out_h, out_w); + ratio_w, n, c, out_h, out_w, + align_corners); } } }; diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 262094f9224407bb412f5b189a748efe13cb04b2..35775d7ec9efcdbad69e4491792f7d4e513832ad 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -21,5 +21,5 @@ endif() cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) if(NOT WIN32) - cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer) + cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor) endif() diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 74d6a87247821eb1d17cc97b8d8b4bcf1c832f79..1b9360afcecf63ff0c3e306cdf303cc426e80f1e 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -18,6 +18,7 @@ #include #include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/place.h" @@ -155,53 +156,89 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... 
args) { LOG(INFO) << loginfos.str(); } -template +using Tensor = paddle::framework::Tensor; + +template void BenchXYZNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d), z(d); - RandomVec(d, x.data()); - RandomVec(d, y.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), - z.data(), d); + Tensor x, y, z; + x.Resize({d}); + y.Resize({d}); + z.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + T* z_data = z.mutable_data(PlaceType()); + RandomVec(d, x_data); + RandomVec(d, y_data); + BenchAllImpls, PlaceType>(d, x.data(), + y.data(), z_data, d); } } -template +template void BenchAXYNKernel() { for (int d : TestSizes()) { const T a = static_cast(3); - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, &a, x.data(), y.data(), + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, + d); + // test inplace + BenchAllImpls, PlaceType>(d, &a, x.data(), x_data, d); } } -template +template +void BenchXRNKernel() { + for (int d : TestSizes()) { + Tensor x; + RandomVec(d, x.mutable_data({d}, PlaceType())); + T res; + BenchAllImpls, PlaceType>(d, x.data(), &res, d); + } +} + +template void BenchXYNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), d); + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, x.data(), y_data, d); } } -template +template void BenchLSTMKernel() { for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole); - std::vector x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); - RandomVec(4 * d, x.data(), -2.f, 2.f); - RandomVec(3 * d, wp.data(), -2.f, 2.f); - RandomVec(d, ct_1.data(), -2.f, 2.f); - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - T* x_data = x.data(); - T* checked_data = checked.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); + Tensor x, ct_1, ct, ht, wp, checked; + x.Resize({4 * d}); + ct_1.Resize({d}); + ct.Resize({d}); + ht.Resize({d}); + wp.Resize({3 * d}); + checked.Resize({2 * d}); + auto place = PlaceType(); + RandomVec(x.numel(), x.mutable_data(place), -2.f, 2.f); + RandomVec(wp.numel(), wp.mutable_data(place), -2.f, 2.f); + RandomVec(ct_1.numel(), ct_1.mutable_data(place), -2.f, 2.f); + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + T* x_data = x.mutable_data(place); + T* checked_data = checked.mutable_data(place); + T* ct_data = ct.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; @@ -216,16 +253,20 @@ void BenchLSTMKernel() { } } -template +template void BenchGRUKernel() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); - std::vector x(3 * d), ht_1(d), ht(d); - RandomVec(3 * d, x.data(), -2.f, 2.f); - RandomVec(d, ht_1.data(), -2.f, 2.f); - const T* ht_1_data = ht_1.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); + auto place = PlaceType(); + Tensor x, ht_1, ht; + x.Resize({3 * d}); + ht_1.Resize({d}); + ht.Resize({d}); + RandomVec(3 * d, x.mutable_data(place), -2.f, 2.f); + RandomVec(d, 
ht_1.mutable_data(place), -2.f, 2.f); + const T* ht_1_data = ht_1.data(); + T* x_data = x.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; @@ -234,7 +275,7 @@ void BenchGRUKernel() { } } -template +template void BenchSeqPoolKernel() { std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; @@ -243,10 +284,12 @@ void BenchSeqPoolKernel() { jit::seq_pool_attr_t attr(w, type); for (int h : TestSizes()) { attr.h = h; - std::vector x(h * w), y(w); - RandomVec(h * w, x.data(), -2.f, 2.f); - const T* x_data = x.data(); - T* y_data = y.data(); + Tensor x, y; + x.Resize({h * w}); + y.Resize({w}); + RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.mutable_data(PlaceType()); BenchAllImpls, PlaceType>(attr, x_data, y_data, &attr); } @@ -254,75 +297,86 @@ void BenchSeqPoolKernel() { } } -template +template void BenchMatMulKernel() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { - std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data(), -2.f, 2.f); - RandomVec(k * n, b.data(), -2.f, 2.f); - const T* a_data = a.data(); - const T* b_data = b.data(); - T* c_data = c.data(); - BenchAllImpls, PlaceType>(k, a_data, b_data, - c_data, m, n, k); + Tensor a, b, c; + a.Resize({m * k}); + b.Resize({k * n}); + c.Resize({m * n}); + RandomVec(m * k, a.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(k * n, b.mutable_data(PlaceType()), -2.f, 2.f); + const T* a_data = a.data(); + const T* b_data = b.data(); + T* c_data = c.mutable_data(PlaceType()); + const jit::matmul_attr_t attr{m, n, k}; + BenchAllImpls, PlaceType>(attr, a_data, b_data, + c_data, &attr); } } } } +template +void BenchSoftmaxKernel() { + for (int bs : {1, 2, 10}) { + for (int n : TestSizes()) { + Tensor x, y; + x.Resize({bs, n}); + y.Resize({bs, n}); + RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.mutable_data(PlaceType()); + BenchAllImpls, PlaceType>(n, x_data, y_data, n, + bs); + } + } +} + using T = float; -using PlaceType = paddle::platform::CPUPlace; +using CPUPlace = paddle::platform::CPUPlace; // xyzn -BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } // axyn -BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } +BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } +BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } -BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } +// xrn +BENCH_FP32_CPU(kHSum) { BenchXRNKernel(); } +BENCH_FP32_CPU(kHMax) { BenchXRNKernel(); } // xyn -BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } // lstm and peephole 
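The LSTM and GRU benchmarks below follow the same pattern as the rest: every Bench*Kernel ultimately fetches the best available implementation for a key and attribute via jit::Get, then calls it like a free function. A sketch for the element-wise XYZN family, assuming the jit headers already included above (other kernel families take different signatures):

#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/place.h"

namespace jit = paddle::operators::jit;

// Fetch the kVAdd kernel tuned for length d and apply it once: z = x + y.
void VAddOnce(const float* x, const float* y, float* z, int d) {
  auto vadd = jit::Get<jit::kVAdd, jit::XYZNTuples<float>,
                       paddle::platform::CPUPlace>(d);
  vadd(x, y, z, d);
}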
-BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } - -BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } +BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } +BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } // gru functions -BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } - -BENCH_FP32_CPU(kGRUHtPart1) { - BenchGRUKernel(); -} - -BENCH_FP32_CPU(kGRUHtPart2) { - BenchGRUKernel(); -} +BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } +BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel(); } +BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel(); } // seq pool function -BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } +BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } // matmul -BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } +BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } + +// softmax +BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel(); } // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 40310c2d2b372a414054f75348e8e1b4471bf3d2..efc7eb79d36c5cf9fac4ac40db4e2e28cb242e22 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET) endfunction() # use gen jitcode kernel by name +USE_JITKERNEL_GEN(kMatMul) USE_JITKERNEL_GEN(kVMul) USE_JITKERNEL_GEN(kVAdd) USE_JITKERNEL_GEN(kVSub) @@ -28,3 +29,5 @@ USE_JITKERNEL_GEN(kGRUHtPart1) USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kNCHW16CMulNC) USE_JITKERNEL_GEN(kSeqPool) +USE_JITKERNEL_GEN(kHMax) +USE_JITKERNEL_GEN(kHSum) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index a2a5661b93ad3d885983c502566860aa313d110f..e7a7375879064eb27c94315fe7b93eece7866b92 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -81,9 +81,7 @@ void VActJitCode::genCode() { #define DECLARE_ACT_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ - return platform::MayIUse(platform::avx); \ - } \ + bool UseMe(const int& attr) const override; \ size_t CodeSize(const int& d) const override; \ std::unique_ptr CreateJitCode(const int& attr) const override { \ return make_unique(attr, CodeSize(attr)); \ @@ -98,6 +96,30 @@ DECLARE_ACT_CREATOR(VSigmoid); DECLARE_ACT_CREATOR(VTanh); // TODO(TJ): tuning use me +bool VReluCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VSquareCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VIdentityCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VExpCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx) && d < 32; +} + +bool VSigmoidCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VTanhCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + size_t VReluCreator::CodeSize(const int& d) const { return 96 /* init size */ + (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7884017198623d996fe98a55691da6e342d656a --- /dev/null +++ b/paddle/fluid/operators/jit/gen/hopv.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/hopv.h" +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void HOPVJitCode::genCode() { + const int num_blocks = num_ / YMM_FLOAT_BLOCK; + int offset = 0; + + if (num_blocks > 0) { + // load one firstly + vmovups(ymm_tmp, ptr[param_src]); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + for (int i = 1; i < num_blocks; ++i) { + vmovups(ymm_src, ptr[param_src + offset]); + process(ymm_tmp, ymm_src, ymm_tmp); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + vextractf128(xmm_dst, ymm_tmp, 1); + process(xmm_dst, xmm_dst, xmm_tmp); + } else { + if (type_ == operand_type::MAX) { + vbroadcastss(ymm_dst, ptr[param_src]); + } else if (type_ == operand_type::ADD) { + vxorps(ymm_dst, ymm_dst, ymm_dst); + } + } + + int rest = num_ % YMM_FLOAT_BLOCK; + if (rest >= 4) { + vmovups(xmm_src, ptr[param_src + offset]); + offset += sizeof(float) * 4; + rest -= 4; + process(xmm_dst, xmm_dst, xmm_src); + } + + vpermilps(xmm_tmp, xmm_dst, 16 + 8 + 3); + process(xmm_dst, xmm_dst, xmm_tmp); + + if (rest >= 2) { + vmovq(xmm_src, ptr[param_src + offset]); + offset += sizeof(float) * 2; + rest -= 2; + process(xmm_dst, xmm_dst, xmm_src); + } + + vpermilps(xmm_tmp, xmm_dst, 1); + process(xmm_dst, xmm_dst, xmm_tmp); + + if (rest >= 1) { + vmovss(xmm_src, ptr[param_src + offset]); + process(xmm_dst, xmm_dst, xmm_src); + } + vmovss(ptr[param_dst], xmm_dst); + ret(); +} + +#define DECLARE_HOP_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + bool UseMe(const int& attr) const override { \ + return platform::MayIUse(platform::avx); \ + } \ + size_t CodeSize(const int& d) const override { \ + return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ + } \ + std::unique_ptr CreateJitCode(const int& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ + } + +DECLARE_HOP_CREATOR(HMax); +DECLARE_HOP_CREATOR(HSum); + +#undef DECLARE_HOP_CREATOR + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator); +REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator); diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h new file mode 100644 index 0000000000000000000000000000000000000000..d3bc94b63d3f962cd655367a2afe1a08582b06fa --- /dev/null +++ b/paddle/fluid/operators/jit/gen/hopv.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +// horizontal operand vector +class HOPVJitCode : public JitCode { + public: + explicit HOPVJitCode(int d, operand_type type, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), num_(d), type_(type) { + if (!(type_ == operand_type::MAX || type_ == operand_type::ADD)) { + LOG(FATAL) << "Do not support this operand type: " << type_; + } + this->genCode(); + } + + virtual const char* name() const { + std::string base = "VXXJitCode"; + if (type_ == operand_type::MAX) { + base += "_MAX"; + } else { + base += "_SUM"; + } + return base.c_str(); + } + void genCode() override; + + protected: + template + void process(JMM& dst, JMM& src1, JMM& src2) { // NOLINT + if (type_ == operand_type::MAX) { + vmaxps(dst, src1, src2); + } else if (type_ == operand_type::ADD) { + vaddps(dst, src1, src2); + } + } + + private: + int num_; + operand_type type_; + reg64_t param_src{abi_param1}; + reg64_t param_dst{abi_param2}; + reg64_t param_attr{abi_param3}; + + ymm_t ymm_tmp = ymm_t(0); + ymm_t ymm_src = ymm_t(1); + ymm_t ymm_dst = ymm_t(2); + + xmm_t xmm_tmp = xmm_t(0); + xmm_t xmm_src = xmm_t(1); + xmm_t xmm_dst = xmm_t(2); +}; + +#define DECLARE_HOP_JITCODE(name, op_type) \ + class name##JitCode : public HOPVJitCode { \ + public: \ + explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ + : HOPVJitCode(d, op_type, code_size, code_ptr) {} \ + }; + +DECLARE_HOP_JITCODE(HMax, operand_type::MAX); +DECLARE_HOP_JITCODE(HSum, operand_type::ADD); + +#undef DECLARE_HOP_JITCODE + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index f63d40ad5a559ab87a9b3735406671cfd936d9e4..c388109604bc57e8475e79a6c57eecb5bfebfb52 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -47,6 +47,7 @@ using Label = Xbyak::Label; typedef enum { MUL = 0, + MAX, ADD, SUB, RELU, diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae3858eab20aeb80553d8fcec4088a6632c9c17d --- /dev/null +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/jit/gen/matmul.h" +#include // offsetof +#include + +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void MatMulJitCode::genCode() { + preCode(); + int block, rest; + const auto groups = packed_groups(n_, k_, &block, &rest); + PADDLE_ENFORCE_GT(groups.front(), 0); + + const int block_len = sizeof(float) * block; + const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; + const int w_reg_idx = x_reg_idx - 1; + // from packed mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t, + // packed_weight)]); + mov(reg_ptr_wgt, param_y); + size_t z_offset = 0; + size_t wgt_offset = 0; + for (size_t g = 0; g < groups.size(); ++g) { + size_t x_offset = 0; + for (int k = 0; k < k_; ++k) { + vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]); + // clean + if (k == 0) { + for (int i = 0; i < groups[g]; ++i) { + vxorps(zmm_t(i), zmm_t(i), zmm_t(i)); + } + } + for (int i = 0; i < groups[g]; ++i) { + vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]); + vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx)); + wgt_offset += block_len; + } + // last one, save + if (k == k_ - 1) { + for (int i = 0; i < groups[g]; ++i) { + // only rest save should be careful + if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) { + break; + } + vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i)); + } + } + x_offset += sizeof(float); + } + z_offset += block_len * groups[g]; + } + + if (rest != 0) { + // below should refine with mask + int reg_idx = groups.back() - 1; + z_offset = (n_ - rest) * sizeof(float); + int inner_block = 8; + while (rest > 0) { + if (rest >= 8) { + inner_block = 8; + vmovups(ptr[param_z + z_offset], ymm_t(reg_idx)); + // shift zmm of inner_block, change reg_idx if update + } else if (rest >= 4) { + inner_block = 4; + vmovups(ptr[param_z + z_offset], xmm_t(reg_idx)); + } else if (rest >= 2) { + inner_block = 2; + vmovq(ptr[param_z + z_offset], xmm_t(reg_idx)); + } else { + inner_block = 1; + vmovss(ptr[param_z + z_offset], xmm_t(reg_idx)); + } + z_offset += inner_block * sizeof(float); + rest -= inner_block; + } + } + + postCode(); +} + +class MatMulCreator : public JitCodeCreator { + public: + bool UseMe(const matmul_attr_t& attr) const override { + return attr.m == 1 && platform::MayIUse(platform::avx512f) && + attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; + } + size_t CodeSize(const matmul_attr_t& attr) const override { + int block = YMM_FLOAT_BLOCK; + if (platform::MayIUse(platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + } + return 96 + 4 * attr.k * (attr.n / block + 1) * 8; + } + std::unique_ptr CreateJitCode( + const matmul_attr_t& attr) const override { + PADDLE_ENFORCE_GT(attr.m, 0); + PADDLE_ENFORCE_GT(attr.n, 0); + PADDLE_ENFORCE_GT(attr.k, 0); + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..626baa8f738bf0395f3c7f1700610d0a9075879b --- /dev/null +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include // for malloc and free +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class MatMulJitCode : public JitCode { + public: + explicit MatMulJitCode(const matmul_attr_t& attr, + size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { + PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + this->genCode(); + } + + virtual const char* name() const { + std::string base = "MatMulJitCode"; + base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + + std::to_string(k_); + return base.c_str(); + } + void genCode() override; + + private: + int m_, n_, k_; + + reg64_t param_x{abi_param1}; + reg64_t param_y{abi_param2}; + reg64_t param_z{abi_param3}; + reg64_t param_attr{abi_param4}; + reg64_t reg_tmp{rax}; + + reg64_t reg_ptr_wgt{r10}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 310da0c76f1ab251d788e54f2305f375f3fb4838..3cd5f6554bdc188ce9ea0c0b85c84d032c509600 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -16,6 +16,8 @@ #include #include #include +#include +#include "paddle/fluid/platform/cpu_info.h" DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); @@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const { } } +std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { + int block; + int max_num_regs; + if (platform::MayIUse(platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + max_num_regs = 32; + } else { + block = YMM_FLOAT_BLOCK; + max_num_regs = 16; + } + // one for x, one for y, others for z + const int max_used_regs_for_n = max_num_regs - 2; + const int aligned_n = n % block == 0 ? 
n : (n / block + 1) * block; + const int num_block = aligned_n / block; + const int num_groups = num_block / max_used_regs_for_n; + std::vector groups(num_groups, max_used_regs_for_n); + int rest_num_regs = num_block % max_used_regs_for_n; + if (rest_num_regs != 0) { + groups.push_back(rest_num_regs); + } + if (block_out) { + *block_out = block; + } + if (rest_out) { + *rest_out = n % block; + } + return groups; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 4af01a437670aa6a07d370ff23ed2abd369f69a3..d808a332472ae86240cb63356cb417123523366a 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -16,6 +16,7 @@ #include #include // for unique_ptr +#include #include "paddle/fluid/operators/jit/kernel_base.h" DECLARE_bool(dump_jitcode); @@ -67,6 +68,11 @@ class JitCodeCreator : public GenCreator { virtual std::unique_ptr CreateJitCode(const Attr& attr) const = 0; }; +// unify the method of packed groups +// output the packed groups which used in weights, the block size and rest size +std::vector packed_groups(int n, int k, int* block = nullptr, + int* rest = nullptr); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 5dbe22a81b4866bdf60a03710d8ffd0b7bcb597b..e7292fe2bd8031aa5bbff68e7c2305a238085bf1 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/operators/jit/helper.h" #include // tolower +#include +#include #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -49,6 +51,9 @@ const char* to_string(KernelType kt) { ONE_CASE(kNCHW16CMulNC); ONE_CASE(kSeqPool); ONE_CASE(kMatMul); + ONE_CASE(kHMax); + ONE_CASE(kHSum); + ONE_CASE(kSoftmax); default: PADDLE_THROW("Not support type: %d, or forget to add it.", kt); return "NOT JITKernel"; @@ -88,6 +93,41 @@ KernelType to_kerneltype(const std::string& act) { return kNone; } +template <> +void pack_weights(const float* src, float* dst, int n, int k) { + int block, rest; + const auto groups = packed_groups(n, k, &block, &rest); + std::for_each(groups.begin(), groups.end(), [&](int i) { + PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + }); + int sum = std::accumulate(groups.begin(), groups.end(), 0); + std::memset(dst, 0, k * sum * block * sizeof(float)); + PADDLE_ENFORCE_GE(sum * block, n, + "The packed n should be equal to or larger than n"); + + const int block_len = sizeof(float) * block; + int n_offset = 0; + + for (size_t g = 0; g < groups.size(); ++g) { + const float* from = src + n_offset; + for (int j = 0; j < k; ++j) { + size_t copy_sz = groups[g] * block_len; + if (g == groups.size() - 1 && rest != 0) { + copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float); + } + std::memcpy(dst, from + j * n, copy_sz); + dst += groups[g] * block; + } + n_offset += groups[g] * block; + } +} + +template +typename std::enable_if::value>::type pack_weights( + const T* src, T* dst, int n, int k) { + PADDLE_THROW("Only support pack with float type."); +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index fbf34fc4b3db49596b6be0360c00e77c12fab9b8..bba3a13619619b6de3f797a4efc4a0d09c3b281f 100644 --- a/paddle/fluid/operators/jit/helper.h +++ 
b/paddle/fluid/operators/jit/helper.h @@ -118,6 +118,28 @@ typename KernelTuples::func_type Get( return GetRefer(); } +template +class KernelFuncsCache { + public: + KernelFuncsCache() = default; + static KernelFuncsCache& Instance() { + static thread_local KernelFuncsCache g_func_cache; + return g_func_cache; + } + + bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } + + typename KernelTuples::func_type At(int key) { return funcs_.at(key); } + + void Insert(int key, typename KernelTuples::func_type func) { + funcs_.emplace(key, func); + } + + private: + std::unordered_map funcs_; + DISABLE_COPY_AND_ASSIGN(KernelFuncsCache); +}; + const char* to_string(KernelType kt); const char* to_string(SeqPoolType kt); @@ -130,17 +152,28 @@ inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) { << (attr.use_peephole ? "True" : "False") << "]"; return os; } + inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) << "],act_cand[" << to_string(attr.act_cand) << "]"; return os; } + inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" << to_string(attr.type) << "]"; return os; } +inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) { + os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]"; + return os; +} + +// expose the method to pack matmul weight +template +void pack_weights(const T* src, T* dst, int n, int k); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index adb101bd5cdf231ac330dbf44beb4c24c1fcf29e..4a8f61146a1921fa1d5f6b7e15af40cd45d31a22 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -20,6 +20,7 @@ namespace paddle { namespace operators { namespace jit { +// TODO(TJ): reorder by alphabet typedef enum { kNone = 0, kVMul = 1, @@ -44,6 +45,9 @@ typedef enum { kNCHW16CMulNC, kSeqPool, kMatMul, + kHSum, // horizontal max + kHMax, // horizontal sum + kSoftmax, } KernelType; typedef enum { @@ -70,6 +74,10 @@ struct XYNTuples { typedef void (*func_type)(const T*, T*, int); }; +// x, return and int +template +struct XRNTuples : public XYNTuples {}; + typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh const void* ct_1; @@ -137,11 +145,19 @@ struct SeqPoolTuples { typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); }; +typedef struct matmul_attr_s { + int m, n, k; + void* packed_weight{nullptr}; + matmul_attr_s() = default; + explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr) + : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {} +} matmul_attr_t; + template struct MatMulTuples { typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const T*, const T*, T*, int, int, int); + typedef matmul_attr_t attr_type; + typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); }; template @@ -159,6 +175,13 @@ struct LayerNormTuples { const float, int); }; +template +struct SoftmaxTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int, int); +}; + // nChw16c = nChw16c .* NC template struct NCHW16CMulNCTuples { diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 
61de38688664f83775c0c4e5aa6f7e06c3602ddb..1e4a8884e78c5d3c1748988f05ecf461a6f0eb94 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -49,6 +49,13 @@ size_t JitCodeKey(const seq_pool_attr_t& attr) { return (key << pool_type_shift) + static_cast(attr.type); } +template <> +size_t JitCodeKey(const matmul_attr_t& attr) { + size_t key = attr.m; + constexpr int shift = 21; + return (key << shift * 2) + ((static_cast(attr.n)) << shift) + attr.k; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt index e05f204b1eebd03c7a00157d96d0482f4a44a7fb..dd039d29152961210958470a48f086a133ab640c 100644 --- a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt @@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kLSTMC1H1, mix) USE_JITKERNEL_MORE(kGRUH1, mix) USE_JITKERNEL_MORE(kGRUHtPart1, mix) USE_JITKERNEL_MORE(kGRUHtPart2, mix) +USE_JITKERNEL_MORE(kSoftmax, mix) diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index df0a85256b1f546d5f64be73925cf58b87a25bd7..0f42ac158ca7926981df55936cb903d5f4ae4806 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -48,6 +48,65 @@ void VTanh(const T* x, T* y, int n) { compute_addbias(&b, y, y, n); } +void Softmax(const T* x, T* y, int n, int bs) { + typename XRNTuples::func_type compute_hmax{nullptr}; + typename XRNTuples::func_type compute_hsum{nullptr}; + typename AXYNTuples::func_type compute_vscal{nullptr}; + typename AXYNTuples::func_type compute_vaddbias{nullptr}; + typename XYNTuples::func_type compute_vexp{nullptr}; + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_hmax = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert(n, compute_hmax); + } else { + compute_hmax = KernelFuncsCache>::Instance().At(n); + } + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_hsum = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert(n, compute_hsum); + } else { + compute_hsum = KernelFuncsCache>::Instance().At(n); + } + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_vscal = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert(n, + compute_vscal); + } else { + compute_vscal = KernelFuncsCache>::Instance().At(n); + } + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_vaddbias = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert( + n, compute_vaddbias); + } else { + compute_vaddbias = + KernelFuncsCache>::Instance().At(n); + } + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_vexp = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert(n, compute_vexp); + } else { + compute_vexp = KernelFuncsCache>::Instance().At(n); + } + + for (int i = 0; i < bs; ++i) { + T scalar; + compute_hmax(x, &scalar, n); + scalar = static_cast(0) - scalar; + compute_vaddbias(&scalar, x, y, n); // x - max + compute_vexp(y, y, n); + compute_hsum(y, &scalar, n); + scalar = static_cast(1) / scalar; + compute_vscal(&scalar, y, y, n); + x += n; + y += n; + } +} + void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT if (type == kVSigmoid) { return Get, platform::CPUPlace>(d); @@ -184,6 +243,8 @@ bool VSigmoidKernel::UseMe(const int& d) const { return true; } bool VTanhKernel::UseMe(const int& d) const { return true; } +bool 
SoftmaxKernel::UseMe(const int& d) const { return true; } + bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; } bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; } @@ -207,6 +268,7 @@ namespace mix = paddle::operators::jit::more::mix; REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid); REGISTER_MORE_KERNEL(kVTanh, VTanh); +REGISTER_MORE_KERNEL(kSoftmax, Softmax); REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt); REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1); REGISTER_MORE_KERNEL(kGRUH1, GRUH1); diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index a70ecdf9348f511311307b4c27bb4506222a7439..d64af192197a0b339a39a1862c028875da2f3900 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -26,6 +26,7 @@ using T = float; void VSigmoid(const T* x, T* y, int n); void VTanh(const T* x, T* y, int n); +void Softmax(const T* x, T* y, int n, int bs); void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); @@ -45,6 +46,9 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr); DECLARE_MORE_KERNEL(VSigmoid, XYNTuples); DECLARE_MORE_KERNEL(VTanh, XYNTuples); +// XRN +DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples); + DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples); DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples); diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 667c6dfad6676d00ab994564bff57c90caa0cb41..f9e5aea32e7cd48e9b39c4c3ee0e30f4a5c84f6f 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kVSquare, mkl) USE_JITKERNEL_MORE(kVSigmoid, mkl) USE_JITKERNEL_MORE(kVTanh, mkl) USE_JITKERNEL_MORE(kSeqPool, mkl) +USE_JITKERNEL_MORE(kSoftmax, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index fccdc68f5efa34bac6f5a34a41569d2f77416284..c7d0215eda9d1e14fcad16da7b70f45824789266 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -25,17 +25,19 @@ namespace more { namespace mkl { template <> -void MatMul(const float* a, const float* b, float* c, int m, int n, - int k) { - platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, - n, k, 1.f, a, k, b, n, 0.f, c, n); +void MatMul(const float* a, const float* b, float* c, + const matmul_attr_t* attr) { + platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + attr->m, attr->n, attr->k, 1.f, a, attr->k, b, + attr->n, 0.f, c, attr->n); } template <> -void MatMul(const double* a, const double* b, double* c, int m, int n, - int k) { - platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, - n, k, 1.0, a, k, b, n, 0.0, c, n); +void MatMul(const double* a, const double* b, double* c, + const matmul_attr_t* attr) { + platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + attr->m, attr->n, attr->k, 1.0, a, attr->k, b, + attr->n, 0.0, c, attr->n); } template <> @@ -116,12 +118,17 @@ void VAXPY(double a, const double* x, double* y, int n) { platform::dynload::cblas_daxpy(n, a, x, 1, y, 1); } -// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> -bool MatMulKernel::UseMe(const int& d) const { - return platform::MayIUse(platform::avx); +void ASum(const float* x, float* res, int n) { + res[0] = platform::dynload::cblas_sasum(n, x, 1); 
} +template <> +void ASum(const double* x, double* res, int n) { + res[0] = platform::dynload::cblas_dasum(n, x, 1); +} + +// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> bool VMulKernel::UseMe(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; @@ -167,13 +174,28 @@ bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { return true; } +template <> +bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { + return platform::MayIUse(platform::avx); +} + +template <> +bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { + return true; +} + +template <> +bool SoftmaxKernel::UseMe(const int& d) const { + // tuned on avx2 + return platform::MayIUse(platform::avx) && d < 60; +} + #define AWALYS_USE_ME_WITH_DOUBLE(func) \ template <> \ bool func##Kernel::UseMe(const int& d) const { \ return true; \ } -AWALYS_USE_ME_WITH_DOUBLE(MatMul); AWALYS_USE_ME_WITH_DOUBLE(VMul); AWALYS_USE_ME_WITH_DOUBLE(VAdd); AWALYS_USE_ME_WITH_DOUBLE(VScal); @@ -181,6 +203,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp); AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); AWALYS_USE_ME_WITH_DOUBLE(VTanh); AWALYS_USE_ME_WITH_DOUBLE(VSquare); +AWALYS_USE_ME_WITH_DOUBLE(Softmax); #undef AWALYS_USE_ME_WITH_DOUBLE } // namespace mkl @@ -204,5 +227,6 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare); REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); REGISTER_MKL_KERNEL(kVTanh, VTanh); REGISTER_MKL_KERNEL(kSeqPool, SeqPool); +REGISTER_MKL_KERNEL(kSoftmax, Softmax); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index a27196fa19f1d3e9aa6c414b6b9f99a21ef49025..8130b87326f1887f232022ab30fa7bf42b0723e7 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -16,6 +16,7 @@ #include #include +#include #include "paddle/fluid/operators/jit/kernel_base.h" namespace paddle { @@ -25,7 +26,7 @@ namespace more { namespace mkl { template -void MatMul(const T* a, const T* b, T* c, int m, int n, int k); +void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr); template void VMul(const T* x, const T* y, T* z, int n); @@ -90,6 +91,30 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { } } +template +void ASum(const T* x, T* res, int n); + +template +void Softmax(const T* x, T* y, int n, int bs) { + std::vector entities(bs); + for (int i = 0; i < bs; ++i) { + entities[i] = x[i * n]; + for (int c = 1; c < n; ++c) { + entities[i] = x[i * n + c] > entities[i] ? 
x[i * n + c] : entities[i]; + } + for (int c = 0; c < n; ++c) { + y[i * n + c] = x[i * n + c] - entities[i]; + } + } + VExp(y, y, n * bs); + for (int i = 0; i < bs; ++i) { + T sum; + ASum(&y[i * n], &sum, n); + sum = static_cast(1) / sum; + VScal(&sum, &y[i * n], &y[i * n], n); + } +} + #define DECLARE_MKL_KERNEL(name, tuples) \ template \ class name##Kernel : public KernelMore> { \ @@ -117,6 +142,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples); DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); +DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples); + #undef DECLARE_MKL_KERNEL } // namespace mkl diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 4b9bc5e8d49c62404d5d4ef99b7c50987fcb415a..9f2935828ca300dbdb71b0fefb6b9883cb45e4b0 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -29,3 +29,6 @@ USE_JITKERNEL_REFER(kNCHW16CMulNC) USE_JITKERNEL_REFER(kSeqPool) USE_JITKERNEL_REFER(kMatMul) USE_JITKERNEL_REFER(kVSquare) +USE_JITKERNEL_REFER(kHSum) +USE_JITKERNEL_REFER(kHMax) +USE_JITKERNEL_REFER(kSoftmax) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 3512ad7fe7921381afb6152330fff6be34de5ad7..b8adb40ec7e1b64df2b04a3201292db235af7b19 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -52,4 +52,9 @@ REGISTER_REFER_KERNEL(kSeqPool, SeqPool); REGISTER_REFER_KERNEL(kMatMul, MatMul); +REGISTER_REFER_KERNEL(kHMax, HMax); +REGISTER_REFER_KERNEL(kHSum, HSum); + +REGISTER_REFER_KERNEL(kSoftmax, Softmax); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 97d029358594d757f0e1874e9c87ecb8f97c9d50..0c4a985f8e8ece0a6169478fa3a9b111f5a6f3b4 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -363,21 +363,57 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { // A(M,K) * B(K,N) = C(M,N) template -void MatMul(const T* A, const T* B, T* C, int M, int N, int K) { +void MatMul(const T* A, const T* B, T* C, const matmul_attr_t* attr) { + int M = attr->m; + int N = attr->n; + int K = attr->k; for (int m = 0; m < M; ++m) { const T* pa = A + m * K; T* pc = C + m * N; for (int n = 0; n < N; ++n) { const T* pb = B + n; - T sum = static_cast(0); - for (int k = 0; k < K; ++k) { - sum += (pa[k] * pb[k * N]); + pc[n] = pa[0] * pb[0]; + for (int k = 1; k < K; ++k) { + pc[n] += pa[k] * pb[k * N]; } - *(pc + n) = sum; } } } +template +void HMax(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] = res[0] < x[i] ? 
x[i] : res[0]; + } +} + +template +void HSum(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] += x[i]; + } +} + +// y = e^(x - max(x)) +// y = y / sum(y) +template +void Softmax(const T* x, T* y, int n, int bs = 1) { + for (int i = 0; i < bs; ++i) { + T scalar; + HMax(x, &scalar, n); + scalar = static_cast(0) - scalar; + VAddBias(&scalar, x, y, n); // x - max + VExp(y, y, n); + HSum(y, &scalar, n); + scalar = static_cast(1) / scalar; + VScal(&scalar, y, y, n); + x += n; + y += n; + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -421,6 +457,11 @@ DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); DECLARE_REFER_KERNEL(MatMul, MatMulTuples); +DECLARE_REFER_KERNEL(HMax, XRNTuples); +DECLARE_REFER_KERNEL(HSum, XRNTuples); + +DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 68a79b6314e4cf86f5b715b9c6694924126b12da..237e588d35cc3b33658a830db34676967818aab6 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" -static double acc = 1e-5; +DEFINE_double(acc, 1e-5, "Test accuracy threshold."); template void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), @@ -39,7 +39,7 @@ template void ExpectEQ(const T* target, const T* refer, int n) { if (std::is_floating_point::value) { for (int i = 0; i < n; ++i) { - EXPECT_NEAR(target[i], refer[i], acc); + EXPECT_NEAR(target[i], refer[i], FLAGS_acc); } } else { for (int i = 0; i < n; ++i) { @@ -61,6 +61,7 @@ std::vector TestSizes() { } namespace jit = paddle::operators::jit; +using CPUPlace = paddle::platform::CPUPlace; template struct TestFuncWithRefer { @@ -121,6 +122,40 @@ struct TestFuncWithRefer, T, std::vector, } }; +template +struct TestFuncWithRefer, std::vector, std::vector, + int, int> { + void operator()(const typename jit::SoftmaxTuples::func_type tgt, + const std::vector& x, const std::vector& yref, int n, + int bs) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + EXPECT_EQ(x.size(), static_cast(n * bs)); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + std::vector ytgt(n * bs); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + } +}; + +template +struct TestFuncWithRefer, std::vector, T> { + void operator()(const typename jit::XRNTuples::func_type tgt, + const std::vector& x, const T ref_res) { + EXPECT_TRUE(tgt != nullptr); + T tgt_res; + tgt(x.data(), &tgt_res, x.size()); + ExpectEQ(&tgt_res, &ref_res, 1); + } +}; + template struct TestFuncWithRefer, std::vector, std::vector> { void operator()(const typename jit::XYNTuples::func_type tgt, @@ -172,7 +207,7 @@ struct TestFuncWithRefer, std::vector, std::vector, T* ht_data = ht.data(); T* checked_data = checked.data(); - paddle::operators::jit::lstm_t step; + jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; step.ct = ct_data; @@ -208,7 +243,7 @@ struct TestFuncWithRefer, std::vector, std::vector, const T* ht_ref_data = ht_ref.data(); T* x_data = x.data(); T* ht_data = ht.data(); - paddle::operators::jit::gru_t step; + jit::gru_t step; step.gates = 
x_data; step.ht_1 = ht_1_data; step.ht = ht_data; @@ -237,26 +272,28 @@ struct TestFuncWithRefer, std::vector, std::vector, template struct TestFuncWithRefer, std::vector, std::vector, - std::vector, int, int, int> { + std::vector, + typename jit::MatMulTuples::attr_type> { void operator()(const typename jit::MatMulTuples::func_type tgt, const std::vector& a, const std::vector& b, - const std::vector& cref, int m, int n, int k) { + const std::vector& cref, + const typename jit::MatMulTuples::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(m * k)); - EXPECT_EQ(b.size(), static_cast(k * n)); - EXPECT_EQ(cref.size(), static_cast(m * n)); + EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); + EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); + EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); std::vector c(cref.size()); const T* a_data = a.data(); const T* b_data = b.data(); const T* cref_data = cref.data(); T* c_data = c.data(); - tgt(a_data, b_data, c_data, m, n, k); - ExpectEQ(c_data, cref_data, m * n); + tgt(a_data, b_data, c_data, &attr); + ExpectEQ(c_data, cref_data, attr.m * attr.n); } }; -template +template void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { TestFuncWithRefer test; // test jitcode @@ -286,9 +323,8 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { test(tgt, args...); } -template +template void TestXYZNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -320,9 +356,8 @@ void TestXYZNKernel() { } } -template +template void TestAXYNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -347,9 +382,26 @@ void TestAXYNKernel() { } } -template +template +void TestXRNKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + auto last_acc = FLAGS_acc; + FLAGS_acc = 1e-4; + for (int d : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(d); + RandomVec(d, x.data(), -2.f, 2.f); + T ref_res; + ref(x.data(), &ref_res, d); + TestAllImpls, PlaceType, std::vector, T>(d, x, + ref_res); + } + FLAGS_acc = last_acc; +} + +template void TestXYNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -373,9 +425,8 @@ void TestXYNKernel() { } } -template +template void TestLSTMKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; for (int d : TestSizes()) { @@ -424,9 +475,8 @@ void TestLSTMKernel() { } } -template +template void TestGRUKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; for (int d : TestSizes()) { @@ -459,7 +509,7 @@ void TestGRUKernel() { } } -template +template void TestSeqPoolKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector pool_types = { @@ -484,12 +534,13 @@ void TestSeqPoolKernel() { } } -template +template void TestMatMulKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = acc; - // TODO(intel): this should be acc issue of MKL - acc = 1e-3; + auto last_acc = 
FLAGS_acc; + // TODO(intel): fix MKL acc issue + // https://github.com/PaddlePaddle/Paddle/issues/15447 + FLAGS_acc = 1e-3; for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { @@ -501,16 +552,42 @@ void TestMatMulKernel() { const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.data(); - ref(a_data, b_data, c_data, m, n, k); + const jit::matmul_attr_t attr{m, n, k}; + ref(a_data, b_data, c_data, &attr); TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(k, a, b, c, m, n, k); + std::vector, std::vector>(attr, a, b, c, attr); } } } - acc = last_acc; + FLAGS_acc = last_acc; +} + +template +void TestSoftmaxKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int bs : {1, 2, 10}) { + for (int n : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(bs * n), y(bs * n); + RandomVec(bs * n, x.data(), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.data(); + + std::vector xinp(x.size()); // inplace test + std::copy(x.begin(), x.end(), xinp.begin()); + ref(x_data, y_data, n, bs); + T* xinp_data = xinp.data(); + ref(xinp_data, xinp_data, n, bs); + ExpectEQ(xinp_data, y_data, n * bs); + + TestAllImpls, PlaceType, std::vector, + std::vector>(n, x, y, n, bs); + } + } } -template +template void TestNCHW16CMulNCKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); const int n = 3, c = 16 * 4, h = 10, w = 10; @@ -565,129 +642,123 @@ void TestNCHW16CMulNCKernel() { // XYZNTuple TEST(JITKernel, kVMul) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVAdd) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVAddRelu) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVSub) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } // AXYNTuples TEST(JITKernel, kVScal) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); + TestAXYNKernel(); + TestAXYNKernel(); } TEST(JITKernel, kVAddBias) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); + TestAXYNKernel(); + TestAXYNKernel(); +} + +// XRNTuples +TEST(JITKernel, kHMax) { + TestXRNKernel(); + TestXRNKernel(); +} + +TEST(JITKernel, kHSum) { + TestXRNKernel(); + TestXRNKernel(); } // XYNTuples TEST(JITKernel, kVRelu) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVIdentity) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVSquare) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVExp) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVSigmoid) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVTanh) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } // LSTM TEST(JITKernel, kLSTMCtHt) { - namespace 
jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); + TestLSTMKernel(); + TestLSTMKernel(); } TEST(JITKernel, kLSTMC1H1) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); + TestLSTMKernel(); + TestLSTMKernel(); } // GRU TEST(JITKernel, kGRUH1) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kGRUHtPart1) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kGRUHtPart2) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kSeqPool) { - namespace jit = paddle::operators::jit; - TestSeqPoolKernel(); - TestSeqPoolKernel(); + TestSeqPoolKernel(); + TestSeqPoolKernel(); } TEST(JITKernel, kMatMul) { - namespace jit = paddle::operators::jit; - TestMatMulKernel(); - TestMatMulKernel(); + TestMatMulKernel(); + TestMatMulKernel(); +} + +TEST(JITKernel, kSoftmax) { + TestSoftmaxKernel(); + TestSoftmaxKernel(); } TEST(JITKernel, kNCHW16CMulNC) { - namespace jit = paddle::operators::jit; - TestNCHW16CMulNCKernel(); - TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); } // TODO(yihua/TJ): add crf decoding and layer norm unit tests diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index fd15539f7b6727496988c9b13d0d2551659a420a..0af8b9e69cfe09890f28ef2028baa19319a5c379 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/operators/lookup_table_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -193,8 +194,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, - ops::LookupTableCUDAKernel); + ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); REGISTER_OP_CUDA_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, - ops::LookupTableGradCUDAKernel); + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index dc27e543f0dfd65e556f9e3a138778972ad6982f..e20524012a5839fd250b7426a5efc42b7e87fe87 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -53,7 +53,8 @@ math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) -math_library(softmax DEPS math_function) +math_library(softmax DEPS math_function jit_kernel_helper) +math_library(beam_search DEPS math_function) math_library(matrix_bit_code) @@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) +cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) if(WITH_GPU) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS 
math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc new file mode 100644 index 0000000000000000000000000000000000000000..69971ef7423eff6bc3f8543a491edb6b0bbd00ca --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.cc @@ -0,0 +1,287 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +namespace paddle { +namespace operators { +namespace math { + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CPUDeviceContext &context, + const framework::LoDTensor *pre_ids, + const framework::LoDTensor *pre_scores, + const framework::LoDTensor *ids, + const framework::LoDTensor *scores, + framework::LoDTensor *selected_ids, + framework::LoDTensor *selected_scores, + framework::Tensor *parent_idx, size_t level, size_t beam_size, + int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + auto &high_level = abs_lod[level]; + + auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores, level, + beam_size, end_id, is_accumulated); + auto selected_items = ToMap(items, high_level.back()); + if (FLAGS_v == 3) { + VLOG(3) << "selected_items:"; + for (size_t i = 0; i < selected_items.size(); ++i) { + VLOG(3) << "offset: " << i; + for (auto &item : selected_items[i]) { + VLOG(3) << item.ToString(); + } + } + } + + PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id); + // calculate the output tensor's height + size_t num_instances = std::accumulate( + std::begin(selected_items), std::end(selected_items), 0, + [](size_t a, std::vector &b) { return a + b.size(); }); + // the output tensor shape should be [num_instances, 1] + auto dims = framework::make_ddim( + std::vector({static_cast(num_instances), 1})); + selected_ids->Resize(dims); + selected_scores->Resize(dims); + parent_idx->Resize({static_cast(num_instances)}); + + auto *selected_ids_data = + selected_ids->mutable_data(platform::CPUPlace()); + auto *selected_scores_data = + selected_scores->mutable_data(platform::CPUPlace()); + auto *parent_idx_data = parent_idx->mutable_data(platform::CPUPlace()); + + // fill in data + std::vector low_level; + size_t low_offset = 0; + for (auto &items : selected_items) { + low_level.push_back(low_offset); + for (auto &item : items) { + parent_idx_data[low_offset] = static_cast(low_level.size() - 1); + selected_ids_data[low_offset] = item.id; + selected_scores_data[low_offset] = item.score; + low_offset++; + } + } + low_level.push_back(low_offset); + + // fill lod + framework::LoD lod(2); + lod[0].assign(high_level.begin(), high_level.end()); + lod[1].assign(low_level.begin(), low_level.end()); + if (!framework::CheckLoD(lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + } + selected_ids->set_lod(lod); + 
selected_scores->set_lod(lod); + } + + /* + * The basic items help to sort. + */ + struct Item { + Item() {} + Item(size_t offset, size_t id, float score) + : offset(offset), id(id), score(score) {} + // offset in the higher lod level. + size_t offset; + // prefix id in the lower lod level. + // size_t prefix; + // the candidate id + size_t id; + // the corresponding score + float score; + + inline bool operator<(const Item &in) const { + return (score < in.score) || + ((score == in.score) && (offset < in.offset)); + } + + inline void operator=(const Item &in) { + offset = in.offset; + id = in.id; + score = in.score; + } + + std::string ToString() { + std::ostringstream os; + os << "{"; + os << "offset: " << offset << ", "; + os << "id: " << id << ", "; + os << "score: " << score << ""; + os << "}"; + return os.str(); + } + }; + + protected: + /* + * Prune the source sentences all branchs finished, and it is optional. + * Pruning must one step later than finishing (thus pre_ids is needed here), + * since the end tokens must be writed out. + */ + void PruneEndBeams(const framework::LoDTensor *pre_ids, + const framework::LoD &abs_lod, + std::vector> *items, size_t lod_level, + int end_id) { + auto *pre_ids_data = pre_ids->data(); + auto &high_level = abs_lod[lod_level]; + for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { + size_t src_prefix_start = high_level[src_idx]; + size_t src_prefix_end = high_level[src_idx + 1]; + bool finish_flag = true; + for (size_t offset = src_prefix_start; offset < src_prefix_end; + offset++) { + for (auto &item : items->at(offset)) { + if (item.id != static_cast(end_id) || + pre_ids_data[offset] != end_id) { + finish_flag = false; + break; + } + } + if (!finish_flag) break; + } + if (finish_flag) { // all branchs of the beam (source sentence) end and + // prune this beam + for (size_t offset = src_prefix_start; offset < src_prefix_end; + offset++) + items->at(offset).clear(); + } + } + } + + /* + * Transform the items into a map whose key is offset, value is the items. + * NOTE low performance. + */ + std::vector> ToMap( + const std::vector> &items, size_t element_num) { + std::vector> result; + result.resize(element_num); + for (auto &entries : items) { + for (const auto &item : entries) { + result[item.offset].push_back(item); + } + } + return result; + } + + void Insert(std::vector *top_beam_ptr, const Item &item, + size_t beam_size) { + std::vector &top_beam = *top_beam_ptr; + + size_t num_beams = top_beam.size(); + if (num_beams < beam_size) { + top_beam.resize(num_beams + 1); + num_beams++; + } else { + if (item < top_beam[beam_size - 1]) { + return; + } + } + + for (int k = static_cast(num_beams) - 2; k >= 0; --k) { + if (top_beam[k] < item) { + top_beam[k + 1] = top_beam[k]; + } else { + top_beam[k + 1] = item; + return; + } + } + top_beam[0] = item; + } + + /* + * For each source, select top beam_size records. + */ + std::vector> SelectTopBeamSizeItems( + const framework::LoDTensor *pre_ids, + const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids, + const framework::LoDTensor *scores, size_t lod_level, size_t beam_size, + int end_id, bool is_accumulated) { + std::vector> result; + + // find the current candidates + auto abs_lod = framework::ToAbsOffset(scores->lod()); + + auto *pre_ids_data = pre_ids->data(); + auto *pre_scores_data = pre_scores->data(); + + auto *ids_data = ids ? 
ids->data() : nullptr; + auto *scores_data = scores->data(); + + size_t num_seqs = scores->NumElements(lod_level); + size_t seq_width = 1; + for (int i = 1; i < scores->dims().size(); i++) { + seq_width *= scores->dims()[i]; + } + + for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) { + size_t seq_offset_start = abs_lod[lod_level][seq_id]; + size_t seq_offset_end = abs_lod[lod_level][seq_id + 1]; + + std::vector top_beam; + top_beam.reserve(beam_size); + + for (size_t offset = seq_offset_start; offset < seq_offset_end; + ++offset) { + auto pre_id = pre_ids_data[offset]; + auto pre_score = pre_scores_data[offset]; + if (pre_id == end_id) { + // Allocate all probability mass to end_id for finished branchs and + // the other candidate ids can be ignored. + Item item(offset, end_id, pre_score); + Insert(&top_beam, item, beam_size); + } else { + size_t index = offset * seq_width; + for (size_t d = 0; d < seq_width; d++, index++) { + int64_t id = ids_data ? ids_data[index] : static_cast(d); + float score = is_accumulated + ? scores_data[index] + : pre_score + std::log(scores_data[index]); + Item item(offset, id, score); + Insert(&top_beam, item, beam_size); + } + } + } + + result.emplace_back(top_beam); + } + + if (FLAGS_v == 3) { + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + for (auto &items : result) { + VLOG(3) << "item set:"; + for (auto &item : items) { + VLOG(3) << item.ToString(); + } + } + } + + return result; + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu new file mode 100644 index 0000000000000000000000000000000000000000..61d021ef627f1ccd90b992c2078a7f3ca879422d --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.cu @@ -0,0 +1,399 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/beam_search.h" +#include "paddle/fluid/platform/cuda_device_function.h" + +namespace paddle { +namespace operators { +namespace math { + +struct Triple { + __device__ __forceinline__ Triple() {} + __device__ __forceinline__ Triple(int o, int i, float s) + : offset(o), id(i), score(s) {} + + __device__ __forceinline__ void set(int o, int i, float s) { + offset = o; + id = i; + score = s; + } + + __device__ __forceinline__ void operator=(const Triple& in) { + offset = in.offset; + id = in.id; + score = in.score; + } + + __device__ __forceinline__ bool operator<(const float s) const { + return score < s; + } + + __device__ __forceinline__ bool operator<(const Triple& in) const { + return (score < in.score) || ((score == in.score) && (offset < in.offset)); + } + + int offset; + int id; + float score; +}; + +__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p, + int beam_size) { + if (p < top_beam[beam_size - 1]) { + return; + } + for (int k = beam_size - 2; k >= 0; --k) { + if (top_beam[k] < p) { + top_beam[k + 1] = top_beam[k]; + } else { + top_beam[k + 1] = p; + return; + } + } + top_beam[0] = p; +} + +template +__device__ __forceinline__ int SelectTopBeam( + Triple* top_beam, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_offset_start, + const int seq_offset_end, const int seq_width, int beam_size, int end_id, + int used_threads) { + // top_beam is shared memory + const int tid = threadIdx.x; + const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq; + + int num_used_threads = used_threads; + + Triple* top_beam_local = top_beam + tid * beam_size; + if (tid_of_seq < num_used_threads) { + for (int i = 0; i < beam_size; ++i) { + top_beam_local[i].set(-1, -1, -INFINITY); + } + + for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) { + int pre_id = static_cast(pre_ids[offset]); + if (pre_id == end_id) { + if (tid_of_seq == 0) { + Triple tmp(offset, end_id, pre_scores[offset]); + Insert(top_beam_local, tmp, beam_size); + } + } else { + int index = offset * seq_width + tid_of_seq; + if (!IsAccumulated) { + float pre_score = pre_scores[offset]; + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + float score = pre_score + __logf(scores[index]); + int id = ids ? static_cast(ids[index]) : i; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } else { + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + int id = ids ? static_cast(ids[index]) : i; + float score = scores[index]; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } + } + } + } + + while (num_used_threads > 1) { + if (num_used_threads > 16) { + __syncthreads(); + } + + num_used_threads = num_used_threads >> 1; + if (tid_of_seq < num_used_threads) { + int index_in_sh = (num_used_threads + tid) * beam_size; + for (int i = 0; i < beam_size; i++) { + Insert(top_beam_local, top_beam[index_in_sh], beam_size); + index_in_sh++; + } + } + } + + if (tid_of_seq == 0) { + int num_items = 0; + for (int i = 0; i < beam_size; ++i) { + num_items = + (top_beam_local[i].score > -INFINITY) ? 
num_items + 1 : num_items; + } + return num_items; + } + + return 0; +} + +__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, + const int64_t* pre_ids, + const int end_id, int num_items) { + bool finish_flag = true; + for (int i = 0; i < num_items; ++i) { + int offset = top_beam_local[i].offset; + if (top_beam_local[i].id != end_id || + static_cast(pre_ids[offset]) != end_id) { + finish_flag = false; + break; + } + } + return finish_flag; +} + +__device__ __forceinline__ void WriteBack( + int64_t* selected_ids, float* selected_scores, int* parent_idx, + size_t* selected_offsets, Triple* top_beam_local, + const int seq_offset_start, const int seq_offset_end, + const int selected_seq_start, const int selected_seq_length) { + const int tid = threadIdx.x; // use 1 thread only for each sequence + int global_index = selected_seq_start; + for (int global_offset = seq_offset_start; global_offset < seq_offset_end; + ++global_offset) { + for (int local_index = 0; local_index < selected_seq_length; + ++local_index) { + if (top_beam_local[local_index].offset == global_offset) { + selected_ids[global_index] = + static_cast(top_beam_local[local_index].id); + selected_scores[global_index] = top_beam_local[local_index].score; + parent_idx[global_index] = static_cast(global_offset); + global_index++; + } + } + selected_offsets[global_offset + 1] = static_cast(global_index); + } +} + +template +__device__ void BeamSearchDetails( + int64_t* selected_ids, float* selected_scores, int* parent_idx, + size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_offset_start, + const int seq_offset_end, const int seq_width, int beam_size, int end_id, + bool is_accumulated, int num_used_threads) { + __shared__ Triple top_beam[MaxLength]; + + int num_items = 0; + if (is_accumulated) { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } else { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } + + const int tid = threadIdx.x; // use 1 thread only for each sequence + const int tid_of_seq = tid % MaxThreadsPerSeq; + if (tid_of_seq == 0) { + // Use 1 thread for each sequence. + Triple* top_beam_local = top_beam + tid * beam_size; + bool finish_flag = + PruneEndBeams(top_beam_local, pre_ids, end_id, num_items); + + int selected_seq_start = 0; + int selected_seq_length = finish_flag ? 0 : num_items; + + if (MaxSeqs > 1) { + const int seq_id = (MaxSeqs > 1) ? 
tid / MaxThreadsPerSeq : tid; + __shared__ int shared_mem[MaxSeqs]; + + // [0, MaxSeqs - 1], length of each sequences + shared_mem[seq_id] = selected_seq_length; + __syncthreads(); + + for (int s = 0; s < seq_id; ++s) { + selected_seq_start += shared_mem[s]; + } + + if (seq_id == 0) { + selected_offsets[0] = 0; + } + } else { + selected_offsets[0] = 0; + } + + WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets, + top_beam_local, seq_offset_start, seq_offset_end, + selected_seq_start, selected_seq_length); + } +} + +template +__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, + int* parent_idx, size_t* selected_offsets, + const int64_t* pre_ids, + const float* pre_scores, const int64_t* ids, + const float* scores, const size_t* seq_offsets, + const int num_seqs, const int seq_width, + int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + const int tid = threadIdx.x; + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + + int seq_offset_start = static_cast(seq_offsets[seq_id]); + int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); + + BeamSearchDetails( + selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids, + pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width, + beam_size, end_id, is_accumulated, num_used_threads); +} + +template +__global__ void BeamSearchKernelSingle( + int64_t* selected_ids, float* selected_scores, int* parent_idx, + size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_length, + const int seq_width, int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + const int seq_offset_start = 0; + const int seq_offset_end = seq_length; + + BeamSearchDetails( + selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids, + pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width, + beam_size, end_id, is_accumulated, num_used_threads); +} + +static inline int GetNumUsedThreads(const int max_threads_per_seq, + const int seq_width, int beam_size) { + int num_used_threads = (seq_width + beam_size - 1) / beam_size; + num_used_threads = max_threads_per_seq < num_used_threads + ? max_threads_per_seq + : num_used_threads; + + num_used_threads = + num_used_threads > 32 + ? (num_used_threads >> 5) << 5 + : (num_used_threads > 16 + ? 32 + : (num_used_threads > 8 + ? 16 + : (num_used_threads > 4 + ? 8 + : (num_used_threads > 2 ? 4 + : num_used_threads)))); + return num_used_threads; +} + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, + framework::Tensor* parent_idx, size_t level, size_t beam_size, + int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + + const int64_t* pre_ids_data = pre_ids->data(); + const float* pre_scores_data = pre_scores->data(); + const int64_t* ids_data = ids ? ids->data() : nullptr; + const float* scores_data = scores->data(); + + const size_t num_seqs = abs_lod[level].size() - 1; + size_t seq_width = 1; + for (int i = 1; i < scores->dims().size(); i++) { + seq_width *= scores->dims()[i]; + } + + // Reserve a big enough memory. 
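+    // Each of the num_seqs sequences contributes at most beam_size selected
+    // items, so num_seqs * beam_size is a safe upper bound for the output
+    // size; the tensors are shrunk again at the end of this function once
+    // the real number of selected items is known.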
+ auto selected_dims = + framework::make_ddim({static_cast(num_seqs * beam_size), 1}); + int64_t* selected_ids_data = + selected_ids->mutable_data(selected_dims, context.GetPlace()); + float* selected_scores_data = + selected_scores->mutable_data(selected_dims, context.GetPlace()); + int* parent_idx_data = parent_idx->mutable_data( + {static_cast(num_seqs * beam_size)}, context.GetPlace()); + + framework::LoD selected_lod(2); + selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); + selected_lod[1].resize(scores->dims()[0] + 1); + size_t* selected_offsets = + selected_lod[1].CUDAMutableData(context.GetPlace()); + + if (num_seqs == 1) { + const int seq_length = static_cast(abs_lod[level][1]); + const int kMaxThreadsPerSeq = 1024; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernelSingle<<< + 1, kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, parent_idx_data, + selected_offsets, pre_ids_data, pre_scores_data, ids_data, + scores_data, seq_length, static_cast(seq_width), + static_cast(beam_size), static_cast(end_id), + is_accumulated, num_used_threads)); + } + } else if (num_seqs <= 4) { + const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace()); + // Use only 1 block + const int kMaxThreadsPerSeq = 32; + const int kMaxSeqs = 4; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernel<<< + 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, parent_idx_data, + selected_offsets, pre_ids_data, pre_scores_data, ids_data, + scores_data, seq_offsets, static_cast(num_seqs), + static_cast(seq_width), static_cast(beam_size), + end_id, is_accumulated, num_used_threads)); + } + } else { + LOG(FATAL) << "Not implemented."; + } + + context.Wait(); + if (!framework::CheckLoD(selected_lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod)); + } + + selected_ids->set_lod(selected_lod); + selected_scores->set_lod(selected_lod); + if (selected_lod[1].back() < num_seqs * beam_size) { + auto final_selected_dims = framework::make_ddim( + {static_cast(selected_lod[1].back()), 1}); + selected_ids->Resize(final_selected_dims); + selected_scores->Resize(final_selected_dims); + parent_idx->Resize({static_cast(selected_lod[1].back())}); + } + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h new file mode 100644 index 0000000000000000000000000000000000000000..4474e7ea52affed792572d02202ec2577c471e50 --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * This is an implementation of beam search. + * + * To explain the details, lets take machine translation task for example, in + * this task, one source sentence is translated to multiple target sentences, + * during this period, one sentence will be translated to multiple translation + * prefixes(target sentence that have not ended), in each time step a prefix + * will have some candidates, input the candidate ids and their corresponding + * scores (probabilities), it will sort and select the top beam_size candidates + * for each source sentence, and store the selected candidates's score and their + * corresponding ids to LoDTensors. + * + * A detailed example: + * + * Input + * + * ids: + * - LoD (should have 2 levels) + * - first level: [0, 1, 4] + * - second level: [0, 1, 2, 3, 4] + * - tensor's data: + * [[4, 2, 5] + * [2, 1, 3] + * [3, 5, 2] + * [8, 2, 1]] + * + * scores: + * - LoD same as `ids` + * - tensor's data + * [[0.5, 0.3, 0.2] + * [0.6, 0.3, 0.1] + * [0.9, 0.5, 0.1] + * [0.7, 0.5, 0.1]] + * + * The inputs means that there are 2 source sentences to translate, and the + * first source has 1 prefix, the second source has 2 prefix. + * + * Lets assume beam size is 2, and the beam search's output should be + * - LoD + * - first level: [0, 1, 2] + * - second level: [0, 2, 4] + * - id tensor's data + * [[4, + * 1, + * 3, + * 8]] + * - score tensor's data + * [[0.5, + * 0.3, + * 0.9, + * 0.7]] + * + * TODO all the prune operations should be in the beam search, so it is better + * to split the beam search algorithm into a sequence of smaller operators, and + * the prune operators can be inserted in this sequence. + */ +template +class BeamSearchFunctor { + public: + /* + * The main function of beam search. + * + * @selected_ids: a [None, 1]-shaped tensor with LoD. + * In a machine translation model, it might be the candidate term id sets, + * each set stored as a varience-length sequence. + * The format might be described with a two-level LoD + * - [[0 1], + * [0 1 2]] + * - [[] + * [0 1]] + * the first level of LoD tells that there are two source sentences. The + * second level describes the details of the candidate id set's offsets in + * the source sentences. + * + * @selected_scores: a LoD tensor with the same shape and LoD with + * selected_ids. + * It stores the corresponding scores of candidate ids in selected_ids. + * + * Return false if all the input tensor is empty, in machine translation task + * that means no candidates is provided, and the task will stop running. 
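+   *
+   * @is_accumulated: when false, `scores` holds per-step probabilities and a
+   * candidate's accumulated score is computed as pre_score + log(score);
+   * when true, `scores` is already accumulated and is used directly. A
+   * branch whose previous id equals `end_id` keeps only that finished
+   * candidate.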
+ */ + void operator()( + const DeviceContext& context, const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids, + const framework::LoDTensor* scores, framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, framework::Tensor* parent_idx, + size_t level, size_t beam_size, int end_id, bool is_accumulated); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ea8eb8b00db328ca13d3d33d751aca4eac66dae --- /dev/null +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +void PrepareCPUTensors(paddle::framework::LoDTensor* ids, + paddle::framework::LoDTensor* scores, + paddle::framework::LoDTensor* pre_ids, + paddle::framework::LoDTensor* pre_scores) { + // lod + paddle::framework::LoD lod; + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); + lod.push_back(level0); + lod.push_back(level1); + ids->set_lod(lod); + scores->set_lod(lod); + + auto dims = paddle::framework::make_ddim({4, 3}); + ids->Resize(dims); + scores->Resize(dims); + + paddle::platform::CPUPlace place; + auto* ids_data = ids->mutable_data(place); + auto* scores_data = scores->mutable_data(place); + std::vector ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + std::vector scores_vec_data( + {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); + + CHECK_EQ(static_cast(ids->numel()), ids_vec_data.size()); + CHECK_EQ(static_cast(ids->numel()), scores_vec_data.size()); + + for (int i = 0; i < ids->numel(); i++) { + ids_data[i] = ids_vec_data[i]; + scores_data[i] = scores_vec_data[i]; + } + + // pre_ids + pre_ids->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_ids->mutable_data(place)[i] = i + 1; + } + + // pre_scores + pre_scores->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_scores->mutable_data(place)[i] = 0.1 * (i + 1); + } +} + +template +void TestBeamSearch() { + paddle::framework::LoDTensor ids; + paddle::framework::LoDTensor scores; + paddle::framework::LoDTensor pre_ids; + paddle::framework::LoDTensor pre_scores; + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); + } else { + paddle::framework::LoDTensor cpu_ids; + paddle::framework::LoDTensor cpu_scores; + paddle::framework::LoDTensor cpu_pre_ids; + paddle::framework::LoDTensor cpu_pre_scores; + + PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); + + TensorCopySync(cpu_ids, *place, &ids); + TensorCopySync(cpu_scores, *place, &scores); + 
TensorCopySync(cpu_pre_ids, *place, &pre_ids); + TensorCopySync(cpu_pre_scores, *place, &pre_scores); + + ids.set_lod(cpu_ids.lod()); + scores.set_lod(cpu_scores.lod()); + pre_ids.set_lod(cpu_pre_ids.lod()); + pre_scores.set_lod(cpu_pre_scores.lod()); + } + + paddle::framework::LoDTensor selected_ids; + paddle::framework::LoDTensor selected_scores; + paddle::framework::LoDTensor parent_idx; + + size_t level = 0; + size_t beam_size = 2; + int end_id = 0; + paddle::operators::math::BeamSearchFunctor beamsearch; + beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, + &selected_scores, &parent_idx, level, beam_size, end_id, true); + + ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); + + paddle::framework::LoDTensor cpu_selected_ids; + paddle::framework::LoDTensor cpu_selected_scores; + if (paddle::platform::is_cpu_place(*place)) { + cpu_selected_ids = selected_ids; + cpu_selected_scores = selected_scores; + } else { + TensorCopySync(selected_ids, paddle::platform::CPUPlace(), + &cpu_selected_ids); + TensorCopySync(selected_scores, paddle::platform::CPUPlace(), + &cpu_selected_scores); + cpu_selected_ids.set_lod(selected_ids.lod()); + cpu_selected_scores.set_lod(selected_scores.lod()); + } + + std::vector expected_ids({4, 5, 3, 8}); + std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); + ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); + } + + delete place; + delete context; +} + +TEST(BeamSearch, CPU) { + TestBeamSearch(); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BeamSearch, GPU) { + TestBeamSearch(); +} +#endif diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 2708f3bcd8f1d2cab19c74b57fdf9f903d9dc65d..238d9f2905058d267ffbee0669594920d7a9e031 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/sampler.h" +#include #include #include #include @@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const { auto index = (*int_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_); if (p > alias_probs_[index]) { - return alias_[index]; + int alias = alias_[index]; + + if (alias == exceptional_val) { + LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val; + return index; + } + + return alias; } else { return index; } diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 98e0b898a504e3bd6b37c3cc772c179eab6038a4..3fa5a7ae336a9be984324411b88570aea99c2c78 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -116,6 +116,7 @@ class CustomSampler : public Sampler { const float* alias_probs_; const int* alias_; const float* probs_; + const int exceptional_val = -1; std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index f15b37a1e3f0ae9c7612c4f74470472393ff4ad6..aedb82da2f0fb2f15e1586d351af7c9d4364852b 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { auto* out_data = output->value().data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 73d83fa2e43f14445c969648cd469b0e32d644c7..74892316e6decdeab3a08396fa2f4bdeb8eb7b73 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) { auto* out_data = output_cpu.data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 5535523e798912ff80eeb5d753914c7d8d70a05f..cf6e89b3d9f11f2b68322ef15ddf026625f6a5a5 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { cpu_in_grad.set_lod(in_grad.lod()); } - EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim); + EXPECT_EQ(in_grad.numel(), static_cast(lod[0].back() * second_dim)); EXPECT_EQ(in_grad.lod(), lod); if (paddle::platform::is_cpu_place(*place)) { diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 1d9d98b10646af9e199f6c481740d30745888707..1ff9ff684fc8001afb0f768a033b4c5bd1592702 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { namespace math { @@ -81,28 +81,10 @@ class SoftmaxFunctor> { const int kBatchDim = 0; const int kClassDim = 1; // 2D data. Batch x C - const int batch_size = in_dims[kBatchDim]; - const int num_classes = in_dims[kClassDim]; - std::vector entities(batch_size); - auto blas = math::GetBlas(context); - for (int n = 0; n < batch_size; ++n) { - entities[n] = in_data[n * num_classes]; - for (int c = 1; c < num_classes; ++c) { - entities[n] = in_data[n * num_classes + c] > entities[n] - ? in_data[n * num_classes + c] - : entities[n]; - } - for (int c = 0; c < num_classes; ++c) { - out_data[n * num_classes + c] = - in_data[n * num_classes + c] - entities[n]; - } - } - - blas.VEXP(num_classes * batch_size, out_data, out_data); - for (int n = 0; n < batch_size; ++n) { - auto sum = blas.ASUM(num_classes, &out_data[n * num_classes], 1); - blas.SCAL(num_classes, 1.0f / sum, &out_data[n * num_classes]); - } + auto compute_softmax = + jit::Get, platform::CPUPlace>( + in_dims[kClassDim]); + compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } }; diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/activation_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/batch_norm_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/concat_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/conv_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc diff --git a/paddle/fluid/operators/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/conv_transpose_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc diff --git a/paddle/fluid/operators/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/dequantize_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/fc_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc diff --git a/paddle/fluid/operators/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/gaussian_random_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc similarity index 88% rename from paddle/fluid/operators/lrn_mkldnn_op.cc rename to 
paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 4e4f977fcc742856b877ef0b7f9a3cc9879aefce..097ba01d401dbc7969e30f576cac2567c874ed99 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { mid->mutable_data(ctx.GetPlace()); const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha"); + // MKL-DNN implements LRN in a caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // Where sum of squares is divided by size of normalization window + // this is not the case for PaddlePaddle LRN. + // Hence we need to compensate for this diffrence by + // multipliing alpha by size of window(n) + const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); const bool is_test = ctx.Attr("is_test"); @@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dims = paddle::framework::vectorize2int(x->dims()); auto src_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto dst_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + dims, mkldnn::memory::data_type::f32, x->format()); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { k}; auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, - static_cast(output_data)}; if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory->set_data_handle( static_cast(const_cast(input_data))); + auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), + static_cast(output_data)); auto workspace_memory = insert_to_context( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -122,8 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory_pd, static_cast(const_cast(input_data))}; auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; + auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), + static_cast(output_data)); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } } }; @@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key_workspace_memory = key + "@lrn_workspace_memory"; const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha"); + const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h similarity index 100% rename from paddle/fluid/operators/mkldnn_activation_op.h rename to paddle/fluid/operators/mkldnn/mkldnn_activation_op.h diff --git 
a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/pool_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc diff --git a/paddle/fluid/operators/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/quantize_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/softmax_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/sum_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/transpose_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2c97eef096eb3d23273e362e658cb1b5fc808609..3e48b67a570d41482e358ae3941eb1e2b6ab91f8 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel { PrepareSamples(context, sampler); auto sample_labels = context.Output("SampleLabels"); const int64_t *sample_labels_data = sample_labels->data(); + + for (int x = 0; x < sample_labels->numel(); x++) { + PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x); + } + auto sample_out = context.Output("SampleLogits"); T *sample_out_data = sample_out->mutable_data(context.GetPlace()); auto label = context.Input("Label"); diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b256ef02666c21ec1db3f6922b56bb23363b4a0 --- /dev/null +++ b/paddle/fluid/operators/ngraph/CMakeLists.txt @@ -0,0 +1,5 @@ +if(WITH_NGRAPH) + cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) + cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto) + op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context) +endif() diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc similarity index 55% rename from paddle/fluid/framework/ngraph_bridge.cc rename to paddle/fluid/operators/ngraph/ngraph_bridge.cc index 365870c54eb3861ad6c273d3866dcd32d1c4166a..13b168ce4553c3377a62d9781b185fa7303c1136 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -17,39 +17,41 @@ limitations under the License. 
*/ #include #include "ngraph/ngraph.hpp" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_ops.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { -namespace framework { +namespace operators { namespace NG_OPS = paddle::operators::ngraphs; std::map&, + std::function&, std::shared_ptr>>)>> NgraphBridge::NG_NODE_MAP = { {"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, - {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, - {"mean", paddle::operators::ngraphs::BuildMeanNode}, - {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, - {"mul", paddle::operators::ngraphs::BuildMulNode}, - {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, - {"softmax", paddle::operators::ngraphs::BuildSoftmaxNode}, - {"softmax_grad", paddle::operators::ngraphs::BuildSoftmaxGradNode}, - {"scale", paddle::operators::ngraphs::BuildScaleNode}, - {"relu", paddle::operators::ngraphs::BuildUnaryNode}, - {"tanh", paddle::operators::ngraphs::BuildUnaryNode}, - {"top_k", paddle::operators::ngraphs::BuildTopKNode}}; - -void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { + {"fill_constant", NG_OPS::BuildFillConstantNode}, + {"mean", NG_OPS::BuildMeanNode}, + {"mean_grad", NG_OPS::BuildMeanGradNode}, + {"mul", NG_OPS::BuildMulNode}, + {"mul_grad", NG_OPS::BuildMulGradNode}, + {"pool2d", NG_OPS::BuildPool2dNode}, + {"pool2d_grad", NG_OPS::BuildPool2dGradNode}, + {"softmax", NG_OPS::BuildSoftmaxNode}, + {"softmax_grad", NG_OPS::BuildSoftmaxGradNode}, + {"scale", NG_OPS::BuildScaleNode}, + {"relu", NG_OPS::BuildUnaryNode}, + {"tanh", NG_OPS::BuildUnaryNode}, + {"top_k", NG_OPS::BuildTopKNode}}; + +void NgraphBridge::BuildNgNode( + const std::shared_ptr& op) { auto& op_type = op->Type(); NG_NODE_MAP[op_type](op, ngb_node_map_); } -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h similarity index 84% rename from paddle/fluid/framework/ngraph_bridge.h rename to paddle/fluid/operators/ngraph/ngraph_bridge.h index 5ad7b8daeb6a782515e50fc87ca7188b46308390..c57988f8f6322e76678c572aa21ff5b17b9e3c22 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.h @@ -21,16 +21,16 @@ limitations under the License. 
*/ #include "ngraph/node.hpp" -namespace paddle { -namespace framework { +#include "paddle/fluid/framework/operator.h" -class OperatorBase; +namespace paddle { +namespace operators { class NgraphBridge { public: static std::map< std::string, - std::function&, + std::function&, std::shared_ptr>>)>> NG_NODE_MAP; @@ -41,7 +41,7 @@ class NgraphBridge { var_node_map) : ngb_node_map_(var_node_map) {} - void BuildNgNode(const std::shared_ptr& op); + void BuildNgNode(const std::shared_ptr& op); private: std::shared_ptr< @@ -49,5 +49,5 @@ class NgraphBridge { ngb_node_map_; }; -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..bec4b514a218715134d2366dd7efd7cf5b377b68 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -0,0 +1,491 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" + +namespace paddle { +namespace operators { + +static ngraph::Shape Ddim2Shape(const framework::DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 
1 : k; + sp.push_back(k); + } + return sp; +} + +static std::map + pd2ng_type_map = { + {framework::proto::VarType::FP32, ngraph::element::f32}, + {framework::proto::VarType::FP64, ngraph::element::f64}, + {framework::proto::VarType::INT32, ngraph::element::i32}, + {framework::proto::VarType::INT64, ngraph::element::i64}, + {framework::proto::VarType::BOOL, ngraph::element::boolean}, +}; + +std::unordered_map> + NgraphEngine::func_cache_ = {}; + +std::shared_ptr NgraphEngine::backend_ = + ngraph::runtime::Backend::create("CPU"); + +static std::vector> NgraphOpIntervals( + framework::BlockDesc* block) { + std::vector> intervals; + auto ops = block->AllOps(); + int size = ops.size(); + int left = 0; + while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { + ++left; + } + if (left == size) { + return intervals; + } + while (left < size && ops.at(left)->Type() == framework::kFeedOpType) { + ++left; + } + + int right = left; + while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { + ++right; + } + if (right == size) { + return intervals; + } + if (left >= right) return intervals; + + // (left, right - 1) represents indices between feed and fetch + int pivot = left; + while (pivot < right) { + auto op_type = ops.at(pivot)->Type(); + if (NgraphBridge::NG_NODE_MAP.find(op_type) == + NgraphBridge::NG_NODE_MAP.end()) { + ++pivot; + } else { + int start = pivot, end = start; + while (pivot < right && + (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) != + NgraphBridge::NG_NODE_MAP.end())) { + ++pivot; + ++end; + } + std::vector interval = {start, end}; + intervals.push_back(interval); + } + } // end while + return intervals; +} + +static void SubstituteNgraphOp(framework::BlockDesc* block, + std::string block_str, + std::vector interval) { + framework::ProgramDesc program; + block->RemoveOp(interval.at(0), interval.at(1)); + auto* ng_op = block->InsertOp(interval.at(0)); + ng_op->SetType("ngraph_engine"); + ng_op->SetAttr("interval", interval); + ng_op->SetAttr("graph", block_str); +} + +// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 +void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { +#ifdef PADDLE_WITH_NGRAPH + VLOG(4) << "use_ngraph=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + // TODO(baojun-nervana): Remove the const_cast + auto* block = + const_cast(program).MutableBlock(bid); + std::string block_str = block->Proto()->SerializeAsString(); + auto intervals = NgraphOpIntervals(block); + for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + SubstituteNgraphOp(block, block_str, *it); + } + } +#else + LOG(WARNING) + << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; +#endif +} + +NgraphEngine::NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval) + : scope_(scope), place_(place) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + + serialized_graph; + + framework::proto::BlockDesc bdesc; + bdesc.ParseFromString(serialized_graph); + framework::BlockDesc block(nullptr, &bdesc); + + Prepare(block, interval); + + BuildNgIO(); + + GetNgFunction(); +} + +void NgraphEngine::Prepare(const framework::BlockDesc& block, + const std::vector& interval) { + for (auto& var : block.AllVars()) { + if (!(var->GetType() == 
framework::proto::VarType::SELECTED_ROWS || + var->GetType() == framework::proto::VarType::LOD_TENSOR || + var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { + continue; + } + + auto var_name = var->Name(); + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_name != framework::kFeedOpType && + var_name != framework::kFetchOpType) { + auto pd_type = var->GetDataType(); + if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { + PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", + var_name); + } + var_type_map_[var_name] = pd2ng_type_map[pd_type]; + } + + if (var->Persistable()) { + persistables_.insert(var->Name()); + } + } + + auto ops_desc = block.AllOps(); + int idx = interval[0]; + while (idx < interval[1]) { + auto op_desc = ops_desc.at(idx); + auto op = framework::OpRegistry::CreateOp(*op_desc); + fused_ops_.push_back(std::move(op)); + ++idx; + } + + while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { + auto op_desc = ops_desc.at(idx); + for (auto& var_name_item : op_desc->Inputs()) { + for (auto& var_name : var_name_item.second) { + post_op_inputs_.insert(var_name); + } + } + ++idx; + } + + while (idx < static_cast(ops_desc.size()) && + ops_desc.at(idx)->Type() == framework::kFetchOpType) { + std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0]; + fetches_.insert(fetch_target_name); + ++idx; + } + + if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType && + ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { + ng_op_state_ = OpState::FULL; + } + + for (auto* op_desc : ops_desc) { + if (op_desc->Type().find("_grad") != std::string::npos) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN + : OpState::PARTIAL_TRAIN; + break; + } + } + + if (ng_op_state_ != OpState::FULL_TRAIN && + ng_op_state_ != OpState::PARTIAL_TRAIN) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? 
OpState::FULL_TEST + : OpState::PARTIAL_TEST; + } +} + +void NgraphEngine::GetNgInputShape( + std::shared_ptr op) { + framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var)); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } + } +} + +void NgraphEngine::BuildNgNodes() { + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + for (auto& var_name : var_name_item.second) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = std::make_shared(ng_type, + ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + } + } + NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphEngine::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case OpState::PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void NgraphEngine::BuildNgFunction() 
{ + BuildNgNodes(); + ngraph_function_ = nullptr; + ngraph::NodeVector func_outputs; + ngraph::ParameterVector func_inputs; + + for (auto& vo : var_out_) { + func_outputs.push_back(var_node_map_->at(vo)); + } + + for (auto& vi : var_in_) { + std::shared_ptr prm = + std::dynamic_pointer_cast( + var_in_node_map_->at(vi)); + func_inputs.push_back(prm); + } + + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); +} + +void NgraphEngine::GetNgFunction() { + bool cache_on = true; + if (cache_on) { + std::string input_shape_str; + for (auto& var_name : var_in_) { + auto shape = var_node_map_->at(var_name)->get_shape(); + for (size_t i = 0; i < shape.size(); ++i) { + input_shape_str += std::to_string(shape.at(i)); + } + } + func_cache_key_ = input_shape_str + func_cache_key_; + if (func_cache_.find(func_cache_key_) != func_cache_.end()) { + ngraph_function_ = func_cache_.at(func_cache_key_); + } else { + BuildNgFunction(); + func_cache_[func_cache_key_] = ngraph_function_; + } + } else { + BuildNgFunction(); + } +} + +void NgraphEngine::Run(const framework::Scope& scope, + const platform::Place& place) const { + std::vector> t_in; + std::vector> t_out; + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto vi = var_in_.at(i); + auto sp = var_node_map_->at(vi)->get_shape(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + auto ng_type = var_type_map_.at(vi); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + const int* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i32, sp, + const_cast(arr)); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handling for var %s", vi); + } + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST || + ng_op_state_ == OpState::FULL_TEST) + ? true + : false; + bool is_persistable = + (persistables_.find(vi) != persistables_.end()) ? 
true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + t_in.push_back(ti); + } + + for (size_t i = 0; i < var_out_.size(); ++i) { + auto vo = var_out_[i]; + auto* var = scope.FindVar(vo); + std::shared_ptr to; + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + auto dd = tensor_pd->dims(); + ngraph::Shape sp = Ddim2Shape(dd); + auto ng_type = var_type_map_.at(vo); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handled in for var %s", vo); + } + t_out.push_back(to); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vo); + } + } + + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); +} // NgraphEngine::Run +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..bf5ff2a743b0edb69163e674d36c56a02c0b4153 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace operators { + +enum class OpState { /* nGraph support state on ops */ + FULL_TRAIN, /* Support full ops for train */ + PARTIAL_TRAIN, /* Support partial ops for train */ + FULL_TEST, /* Support full list of ops for test */ + PARTIAL_TEST, /* Support partial list of ops for test */ + FULL, /* All ops supported from feed to fetch */ + UNKNOWN /* Output all for debug purpose */ +}; + +// perform graph build through bridge and execute computation +class NgraphEngine { + public: + explicit NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval); + + void Run(const framework::Scope& scope, const platform::Place& place) const; + + static void EnableNgraph(const framework::ProgramDesc& program); + + private: + static std::unordered_map> + func_cache_; + const framework::Scope& scope_; + const platform::Place& place_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + OpState ng_op_state_ = OpState::UNKNOWN; + std::string func_cache_key_; + + // ngraph backend eg. CPU + static std::shared_ptr backend_; + // ngraph function to call and execute + std::shared_ptr ngraph_function_; + // var_name of inputs + std::vector var_in_; + // var_name of outputs from fetch in order + std::vector var_out_; + // map input vars to nodes + std::shared_ptr< + std::unordered_map>> + var_in_node_map_; + // map each var name with a ngraph node + std::shared_ptr< + std::unordered_map>> + var_node_map_; + // prepare info for nraph engine + void Prepare(const framework::BlockDesc& block, + const std::vector& interval); + // get ngraph input and define ngraph input parameters + void GetNgInputShape(std::shared_ptr op); + // Call ngraph bridge to map ops + void BuildNgNodes(); + // get the ngraph input and output var list + void BuildNgIO(); + // build ngraph function call + void BuildNgFunction(); + // Check cache for ngraph function or otherwise build the function + void GetNgFunction(); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3051ca123b29658d3e9a35239ad00f621a297cb5 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Xs", "A list of inputs.").AsDispensable(); + AddOutput("Ys", "A list of outputs").AsDispensable(); + AddAttr("graph", "the graph."); + AddAttr>("interval", "op interval supported by ngraph"); + AddComment("ngraph engine operator."); + } +}; + +class NgraphEngineInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker, + ops::NgraphEngineOpMaker); +REGISTER_OP_CPU_KERNEL( + ngraph_engine, + ops::NgraphEngineKernel); diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d2974298b0707575624ad2f6935e83d06b4c83bb --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::proto::VarType::FP32, ctx.GetPlace()); + return kt; + } +}; + +template +class NgraphEngineKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& scope = ctx.scope(); + auto place = ctx.GetPlace(); + std::string serialized_graph = ctx.Attr("graph"); + auto interval = ctx.Attr>("interval"); + + NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); + ngraph_engine.Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h index b6c7c67c13cc54a4ccdfb4e33795cad76d8179c8..4b7aa3393b40054cb9e51152ea8fc4ac6548c008 100644 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ b/paddle/fluid/operators/ngraph/ngraph_ops.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "ops/fill_constant_op.h" #include "ops/mean_op.h" #include "ops/mul_op.h" +#include "ops/pool2d_op.h" #include "ops/scale_op.h" #include "ops/softmax_op.h" #include "ops/top_k_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/pool2d_op.h b/paddle/fluid/operators/ngraph/ops/pool2d_op.h new file mode 100644 index 0000000000000000000000000000000000000000..836c9d6c185b305d3dd4c9e9d30e23abb0c1431c --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/pool2d_op.h @@ -0,0 +1,174 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildPool2dNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto x_shape = x->get_shape(); + + std::string pooling_type = op_attrs.Get("pooling_type"); + std::vector ksize = op_attrs.Get>("ksize"); + std::vector strides = op_attrs.Get>("strides"); + std::vector paddings = op_attrs.Get>("paddings"); + + PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(), + "Handling 2d pooling only"); + + if (op_attrs.Get("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(x_shape.at(i + 2)); + } + } + + ngraph::Shape ng_padding_below{static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + ngraph::Shape ng_padding_above{static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + ngraph::Shape ng_ksize_shape{static_cast(ksize.at(0)), + static_cast(ksize.at(1))}; + ngraph::Strides ng_strides{static_cast(strides.at(0)), + static_cast(strides.at(1))}; + + auto ComputeCeiledOutput = [](size_t in, size_t k, size_t p, size_t s) { + return (in - k + 2 * p) / s + 1; + }; + + if (op_attrs.Get("ceil_mode")) { + auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map); + auto dummpy_shape = dummy_out->get_shape(); + for (size_t i = 0; i < ng_padding_above.size(); ++i) { + auto desired_size = ComputeCeiledOutput(x_shape[i + 2], ksize[i], + paddings[i], strides[i]); + if (desired_size != dummpy_shape[i + 2]) { + ng_padding_above[i] += strides[i]; + } + } + } + + bool padding_exclusive = op_attrs.Get("exclusive"); + if (pooling_type == "max") { + auto pool2d = std::make_shared( + x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above); + paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map); + } else if (pooling_type == "avg") { + std::shared_ptr pool2d; + if (op_attrs.Get("adaptive")) { + auto ComputeAdaptive = [](size_t in, size_t k) { + return std::floor(in / k); + }; + ng_strides[0] = x_shape.size() == 4 + ? ComputeAdaptive(x_shape[3], ksize[0]) + : ng_strides[0]; + ng_strides[1] = x_shape.size() == 4 + ? 
ComputeAdaptive(x_shape[3], ksize[0]) + : ng_strides[1]; + pool2d = + std::make_shared(x, ng_ksize_shape, ng_strides); + } else { + pool2d = std::make_shared( + x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above, + !padding_exclusive); + } + paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map); + } else { + PADDLE_THROW("Support max and avg pooling only"); + } +} + +void BuildPool2dGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map); + auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto x_shape = x->get_shape(); + + std::string pooling_type = op_attrs.Get("pooling_type"); + std::vector ksize = op_attrs.Get>("ksize"); + std::vector strides = op_attrs.Get>("strides"); + std::vector paddings = op_attrs.Get>("paddings"); + + PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(), + "Handling 2d pooling only"); + + if (op_attrs.Get("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(x_shape.at(i + 2)); + } + } + + ngraph::Shape ng_padding_below{static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + ngraph::Shape ng_padding_above{static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + ngraph::Shape ng_ksize_shape{static_cast(ksize.at(0)), + static_cast(ksize.at(1))}; + ngraph::Strides ng_strides{static_cast(strides.at(0)), + static_cast(strides.at(1))}; + + bool padding_exclusive = op_attrs.Get("exclusive"); + if (pooling_type == "max") { + auto pool2d_grad = std::make_shared( + x, dout, out, ng_ksize_shape, ng_strides, ng_padding_below, + ng_padding_above); + paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map); + } else if (pooling_type == "avg") { + std::shared_ptr pool2d_grad; + if (op_attrs.Get("adaptive")) { + auto ComputeAdaptive = [](size_t in, size_t k) { + return std::floor(in / k); + }; + ng_strides[0] = x_shape.size() == 4 + ? ComputeAdaptive(x_shape[3], ksize[0]) + : ng_strides[0]; + ng_strides[1] = x_shape.size() == 4 + ? 
ComputeAdaptive(x_shape[3], ksize[0]) + : ng_strides[1]; + pool2d_grad = std::make_shared( + x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below, + ng_padding_above, !padding_exclusive); + } else { + pool2d_grad = std::make_shared( + x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below, + ng_padding_above, !padding_exclusive); + } + paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map); + } else { + PADDLE_THROW("Support max and avg pooling only"); + } +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index 58a465d87a8c0da50e3eb80fefe32d50217f6990..2a3e80c9152b5550631f8c5669283b782f975d4e 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -41,13 +41,19 @@ class CreateCTRReaderOp : public framework::OperatorBase { auto* queue_holder = queue_holder_var->template GetMutable(); - int thread_num = Attr("thread_num"); - std::vector slots = Attr>("slots"); - int batch_size = Attr("batch_size"); - std::vector file_list = - Attr>("file_list"); - out->Reset(std::make_shared(queue_holder->GetQueue(), batch_size, - thread_num, slots, file_list)); + auto thread_num = Attr("thread_num"); + auto sparse_slots = Attr>("sparse_slots"); + auto dense_slot_index = Attr>("dense_slot_index"); + auto sparse_slot_index = Attr>("sparse_slot_index"); + auto batch_size = Attr("batch_size"); + auto file_type = Attr("file_type"); + auto file_format = Attr("file_format"); + auto file_list = Attr>("file_list"); + DataDesc data_desc(batch_size, file_list, file_type, file_format, + dense_slot_index, sparse_slot_index, sparse_slots); + VLOG(1) << data_desc; + out->Reset(std::make_shared(queue_holder->GetQueue(), thread_num, + data_desc)); } }; @@ -58,10 +64,22 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase { "Name of the `LoDTensorBlockingQueueHolder` variable"); AddAttr("thread_num", "the thread num to read data"); AddAttr("batch_size", "the batch size of read data"); + AddAttr("file_type", "plain or gzip").SetDefault("plain"); + AddAttr("file_format", "svm or csv").SetDefault("csv"); AddAttr>("file_list", "The list of files that need to read"); - AddAttr>( - "slots", "the slots that should be extract from file"); + AddAttr>( + "dense_slot_index", + "the dense slots id that should be extract from file") + .SetDefault({}); + AddAttr>( + "sparse_slot_index", + "the sparse slots id that should be extract from file") + .SetDefault({}); + AddAttr>("sparse_slots", + "the sparse slots id that should be " + "extract from file, used when file " + "format is svm"); AddComment(R"DOC( Create CTRReader to support read ctr data with cpp. 
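For orientation, the sketch below (not part of the patch) shows how the reworked reader pieces are wired together on the C++ side, using the DataDesc and CTRReader interfaces introduced in ctr_reader.h further down; the function name, file names, batch size, queue capacity, and thread count are illustrative only.

#include <string>
#include <vector>

#include "paddle/fluid/operators/reader/ctr_reader.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"

using paddle::operators::reader::CTRReader;
using paddle::operators::reader::DataDesc;
using paddle::operators::reader::LoDTensorBlockingQueueHolder;

void ExampleCtrCsvReader() {
  LoDTensorBlockingQueueHolder holder;
  holder.InitOnce(/*capacity=*/64, /*speed_test_mode=*/false);

  // Each CSV line looks like "label dense_fea,dense_fea sparse_fea,sparse_fea":
  // column 1 holds a dense slot, column 2 a sparse slot (see parse_csv_line in
  // ctr_reader.cc). The file names are placeholders and must exist on disk.
  std::vector<std::string> files = {"part-00000.csv", "part-00001.csv"};
  DataDesc desc(/*batch_size=*/32, files, /*file_type=*/"plain",
                /*file_format=*/"csv", /*dense_slot_index=*/{1},
                /*sparse_slot_index=*/{2}, /*sparse_slot_ids=*/{});

  CTRReader reader(holder.GetQueue(), /*thread_num=*/2, desc);
  reader.Start();  // spawns read threads; ReadNext() then yields the label,
                   // dense, and sparse LoDTensors for each batch.
}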
diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index d1d3ddc89dc09a185e6a41274cf382b430ec3eeb..f08798794a2f9fc042800583cbc032d6f12bf3dc 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -73,6 +73,9 @@ static inline void parse_line( } } +// label slot1:fea_sign slot2:fea_sign slot1:fea_sign +static inline void parse_svm_line(const std::string& line) {} + class Reader { public: virtual ~Reader() {} @@ -95,11 +98,27 @@ class GzipReader : public Reader { igzstream gzstream_; }; -class MultiGzipReader : public Reader { +class PlainFileReader : public Reader { public: - explicit MultiGzipReader(const std::vector& file_list) { + explicit PlainFileReader(const std::string& file_name) + : stream_(file_name.c_str()) {} + + ~PlainFileReader() {} + + bool HasNext() override { return stream_.peek() != EOF; } + + void NextLine(std::string* line) override { std::getline(stream_, *line); } + + private: + std::ifstream stream_; +}; + +template +class MultiFileReader : public Reader { + public: + explicit MultiFileReader(const std::vector& file_list) { for (auto& file : file_list) { - readers_.emplace_back(std::make_shared(file)); + readers_.emplace_back(std::make_shared(file)); } } @@ -119,46 +138,35 @@ class MultiGzipReader : public Reader { } private: - std::vector> readers_; + std::vector> readers_; size_t current_reader_index_ = 0; }; void MonitorThread(std::vector* thread_status, std::shared_ptr queue) { - VLOG(30) << "monitor thread in"; + VLOG(3) << "monitor thread in"; bool reader_thread_is_running = true; while (reader_thread_is_running) { - VLOG(30) << "reader_thread_is_running"; + VLOG(3) << "reader_thread_is_running"; reader_thread_is_running = false; for (size_t i = 0; i < (*thread_status).size(); ++i) { if ((*thread_status)[i] == Running) { - VLOG(30) << "reader is running!"; + VLOG(3) << "reader is running!"; reader_thread_is_running = true; } } std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } - VLOG(30) << "all reader thread is stopped, push empty data into queue"; - queue->Push({}); - VLOG(30) << "monitor thread exited"; + VLOG(3) << "all reader thread is stopped, close the queue"; + queue->Close(); + VLOG(3) << "monitor thread exited"; } -void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, - int thread_id, std::vector* thread_status, - std::shared_ptr queue) { - VLOG(30) << "[" << thread_id << "]" - << " reader thread start! 
thread_id = " << thread_id; - for (auto& file : file_list) { - VLOG(30) << "[" << thread_id << "]" - << " file " << file; - } - (*thread_status)[thread_id] = Running; - VLOG(30) << "set status to running"; - +void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { std::unordered_map slot_to_index; - for (size_t i = 0; i < slots.size(); ++i) { - slot_to_index[slots[i]] = i; + for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) { + slot_to_index[data_desc.sparse_slot_ids_[i]] = i; } std::string line; @@ -166,21 +174,17 @@ void ReadThread(const std::vector& file_list, std::vector>> batch_data; std::vector batch_label; - MultiGzipReader reader(file_list); - - VLOG(30) << "reader inited"; - - while (reader.HasNext()) { + while (reader->HasNext()) { batch_data.clear(); - batch_data.reserve(batch_size); + batch_data.reserve(data_desc.batch_size_); batch_label.clear(); - batch_label.reserve(batch_size); + batch_label.reserve(data_desc.batch_size_); // read batch_size data - for (int i = 0; i < batch_size; ++i) { - if (reader.HasNext()) { - reader.NextLine(&line); + for (int i = 0; i < data_desc.batch_size_; ++i) { + if (reader->HasNext()) { + reader->NextLine(&line); std::unordered_map> slot_to_data; int64_t label; parse_line(line, slot_to_index, &label, &slot_to_data); @@ -193,8 +197,8 @@ void ReadThread(const std::vector& file_list, std::vector lod_datas; - // first insert tensor for each slots - for (auto& slot : slots) { + // first insert tensor for each sparse_slots + for (auto& slot : data_desc.sparse_slot_ids_) { std::vector lod_data{0}; std::vector batch_feasign; @@ -226,11 +230,167 @@ void ReadThread(const std::vector& file_list, lod_datas.push_back(label_tensor); queue->Push(lod_datas); - VLOG(40) << "push one data, queue_size=" << queue->Size(); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + +// label dense_fea,dense_fea sparse_fea,sparse_fea +static inline void parse_csv_line( + const std::string& line, const DataDesc& data_desc, int64_t* label, + std::vector>* dense_datas, + std::vector>* sparse_datas) { + std::vector ret; + string_split(line, ' ', &ret); + *label = std::stol(ret[0]); + dense_datas->resize(data_desc.dense_slot_index_.size()); + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + int slot_idx = data_desc.dense_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(slot_data, ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + (*dense_datas)[i].push_back(std::stof(data_str)); + } + } + sparse_datas->resize(data_desc.sparse_slot_index_.size()); + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + int slot_idx = data_desc.sparse_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(slot_data, ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + auto id = std::stol(data_str); + (*sparse_datas)[i].push_back(id); + } + } +} + +void ReadCsvData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { + std::string line; + while (reader->HasNext()) { + std::vector batch_label; + batch_label.reserve(data_desc.batch_size_); + + std::vector>> batch_dense_data; + batch_dense_data.reserve(data_desc.batch_size_); + + std::vector>> batch_sparse_data; + batch_sparse_data.reserve(data_desc.batch_size_); + + // read batch_size data + for (int i = 0; i < data_desc.batch_size_; ++i) { + 
if (reader->HasNext()) { + reader->NextLine(&line); + int64_t label; + std::vector> dense_datas; + std::vector> sparse_datas; + parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas); + batch_label.push_back(label); + if (!batch_dense_data.empty()) { + PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(), + "dense data should have the same shape"); + } + batch_dense_data.push_back(dense_datas); + batch_sparse_data.push_back(sparse_datas); + } else { + break; + } + } + + // the order of output data is label, dense_datas, sparse_datas + std::vector lod_datas; + + // insert label tensor + framework::LoDTensor label_tensor; + auto* label_tensor_data = label_tensor.mutable_data( + framework::make_ddim({static_cast(batch_label.size()), 1}), + platform::CPUPlace()); + memcpy(label_tensor_data, batch_label.data(), + batch_label.size() * sizeof(int64_t)); + lod_datas.push_back(label_tensor); + + // insert tensor for each dense_slots + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + framework::LoDTensor lod_tensor; + size_t width = batch_dense_data[0][i].size(); + auto* tensor_data = lod_tensor.mutable_data( + framework::make_ddim( + {static_cast(batch_dense_data.size()), // batch_size + static_cast(width)}), + platform::CPUPlace()); + + for (size_t j = 0; j < batch_dense_data.size(); ++j) { + auto& dense_data_row = batch_dense_data[j][i]; + memcpy(tensor_data + j * width, dense_data_row.data(), + width * sizeof(float)); + } + + lod_datas.push_back(lod_tensor); + } + + // insert tensor for each sparse_slots + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + std::vector lod_data{0}; + std::vector batch_feasign; + + for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) { + auto& sparse_ids = batch_sparse_data[row_idx][i]; + lod_data.push_back(lod_data.back() + sparse_ids.size()); + batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(), + sparse_ids.end()); + } + + framework::LoDTensor lod_tensor; + framework::LoD lod{lod_data}; + lod_tensor.set_lod(lod); + int64_t* tensor_data = lod_tensor.mutable_data( + framework::make_ddim({static_cast(batch_feasign.size()), 1}), + platform::CPUPlace()); + memcpy(tensor_data, batch_feasign.data(), + batch_feasign.size() * sizeof(int64_t)); + lod_datas.push_back(lod_tensor); + } + + queue->Push(lod_datas); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + +void ReadThread(const std::vector& file_list, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, + std::shared_ptr queue) { + VLOG(3) << "[" << thread_id << "]" + << " reader thread start! 
thread_id = " << thread_id; + for (auto& file : file_list) { + VLOG(3) << "[" << thread_id << "]" + << " file " << file; + } + (*thread_status)[thread_id] = Running; + VLOG(3) << "set status to running"; + + std::shared_ptr reader; + if (data_desc.file_type_ == "gzip") { + reader.reset(new MultiFileReader(file_list)); + } else if (data_desc.file_type_ == "plain") { + reader.reset(new MultiFileReader(file_list)); + } else { + PADDLE_THROW("do not support file format %s", data_desc.file_type_); + } + + VLOG(3) << "reader inited"; + + if (data_desc.file_format_ == "svm") { + ReadSvmData(data_desc, reader, queue); + } else if (data_desc.file_format_ == "csv") { + ReadCsvData(data_desc, reader, queue); } (*thread_status)[thread_id] = Stopped; - VLOG(30) << "set status to stopped, thread " << thread_id << " exited"; + VLOG(3) << "set status to stopped, thread " << thread_id << " exited"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 56879ffda5d3e04a88d12d6c4701c24a0d0ee4f7..740cd5219c70331d1f71d832adef084c148a2408 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -36,9 +36,63 @@ namespace reader { enum ReaderThreadStatus { Running, Stopped }; +struct DataDesc { + DataDesc(int batch_size, const std::vector& file_names, + const std::string& file_type, const std::string& file_format, + const std::vector& dense_slot_index, + const std::vector& sparse_slot_index, + const std::vector& sparse_slot_ids) + : batch_size_(batch_size), + file_names_(file_names), + file_type_(file_type), + file_format_(file_format), + dense_slot_index_(dense_slot_index), + sparse_slot_index_(sparse_slot_index), + sparse_slot_ids_(sparse_slot_ids) {} + + const int batch_size_; + const std::vector file_names_; + const std::string file_type_; // gzip or plain + const std::string file_format_; // csv or svm + // used for csv data format + const std::vector dense_slot_index_; + const std::vector sparse_slot_index_; + // used for svm data format + const std::vector sparse_slot_ids_; +}; + +inline std::ostream& operator<<(std::ostream& os, const DataDesc& data_desc) { + os << "data_desc:\n"; + os << "\tbatch_size -> " << data_desc.batch_size_ << "\n"; + os << "\tfile_type -> " << data_desc.file_type_ << "\n"; + os << "\tfile_format -> " << data_desc.file_format_ << "\n"; + os << "\tfile_names -> {"; + for (auto& file_name : data_desc.file_names_) { + os << file_name << ","; + } + os << "}\n"; + os << "\tdense_slot_index -> {"; + for (auto& slot : data_desc.dense_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_index_ -> {"; + for (auto& slot : data_desc.sparse_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_ids_ -> {"; + for (auto& slot : data_desc.sparse_slot_ids_) { + os << slot << ","; + } + os << "}\n"; + + return os; +} + void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, - int thread_id, std::vector* thread_status, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, std::shared_ptr queue); // monitor all running thread, if they are all stopped, @@ -48,15 +102,15 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: - explicit CTRReader(const std::shared_ptr& queue, - int batch_size, size_t thread_num, - const std::vector& slots, - const std::vector& file_list) - : batch_size_(batch_size), slots_(slots), 
file_list_(file_list) { + CTRReader(const std::shared_ptr& queue, + int thread_num, const DataDesc& data_desc) + : data_desc_(data_desc) { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); - PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = std::min(file_list_.size(), thread_num); + PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0, + "file list should not be empty"); + + thread_num_ = std::min(data_desc_.file_names_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -64,7 +118,7 @@ class CTRReader : public framework::FileReader { } } - ~CTRReader() {} + ~CTRReader() { Shutdown(); } void ReadNext(std::vector* out) override { bool success; @@ -81,7 +135,10 @@ class CTRReader : public framework::FileReader { for (auto& read_thread : read_threads_) { read_thread->join(); } - monitor_thread_->join(); + + if (monitor_thread_) { + monitor_thread_->join(); + } read_threads_.clear(); monitor_thread_.reset(nullptr); @@ -95,9 +152,9 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + for (int thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread(std::bind( - &ReadThread, file_groups_[thread_id], slots_, batch_size_, + &ReadThread, file_groups_[thread_id], data_desc_, static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( @@ -108,8 +165,8 @@ class CTRReader : public framework::FileReader { private: void SplitFiles() { file_groups_.resize(thread_num_); - for (size_t i = 0; i < file_list_.size(); ++i) { - auto& file_name = file_list_[i]; + for (size_t i = 0; i < data_desc_.file_names_.size(); ++i) { + auto& file_name = data_desc_.file_names_[i]; std::ifstream f(file_name.c_str()); PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name); file_groups_[i % thread_num_].push_back(file_name); @@ -118,9 +175,7 @@ class CTRReader : public framework::FileReader { private: size_t thread_num_; - const int batch_size_; - const std::vector slots_; - const std::vector file_list_; + const DataDesc data_desc_; std::shared_ptr queue_; std::vector> read_threads_; std::unique_ptr monitor_thread_; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 8dba9baebce0a82ee2a541fe6ae9f6bcef8e2835..9f3a254c84d4e04fbcd449644a7e138eff520fbc 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -36,6 +36,7 @@ using paddle::framework::LoD; using paddle::framework::DDim; using paddle::platform::CPUPlace; using paddle::framework::make_ddim; +using paddle::operators::reader::DataDesc; static void generatedata(const std::vector& data, const std::string& file_name) { @@ -126,30 +127,103 @@ TEST(CTR_READER, read_data) { LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; - queue_holder.InitOnce(capacity, {}, false); + queue_holder.InitOnce(capacity, false); std::shared_ptr queue = queue_holder.GetQueue(); int batch_size = 3; int thread_num = 1; - std::vector slots = {"6002", "6003"}; + std::vector sparse_slots = {"6002", "6003"}; std::vector file_list; for (int i = 0; i < thread_num; ++i) { file_list.push_back(gz_file_name); } - CTRReader reader(queue, batch_size, 
thread_num, slots, file_list); + DataDesc data_desc(batch_size, file_list, "gzip", "svm", {}, {}, + sparse_slots); + + CTRReader reader(queue, thread_num, data_desc); reader.Start(); size_t batch_num = std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; - check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, - data_slot_6003, batch_num, batch_size, queue, &reader); + check_all_data(ctr_data, sparse_slots, label_dims, label_value, + data_slot_6002, data_slot_6003, batch_num, batch_size, queue, + &reader); reader.Shutdown(); reader.Start(); - check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, - data_slot_6003, batch_num, batch_size, queue, &reader); + check_all_data(ctr_data, sparse_slots, label_dims, label_value, + data_slot_6002, data_slot_6003, batch_num, batch_size, queue, + &reader); reader.Shutdown(); } + +static void GenereteCsvData(const std::string& file_name, + const std::vector& data) { + std::ofstream out(file_name.c_str()); + PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name); + for (auto& c : data) { + out << c; + } + out.close(); + PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name); +} + +static void CheckReadCsvOut(const std::vector& out) { + ASSERT_EQ(out.size(), 3); + ASSERT_EQ(out[0].dims()[1], 1); + ASSERT_EQ(out[1].dims()[1], 2); + ASSERT_EQ(out[2].dims()[1], 1); + for (size_t i = 0; i < out[0].numel(); ++i) { + int64_t label = out[0].data()[i]; + auto& dense_dim = out[1].dims(); + for (size_t j = 0; j < dense_dim[1]; ++j) { + ASSERT_EQ(out[1].data()[i * dense_dim[1] + j], + static_cast(label + 0.1)); + } + auto& sparse_lod = out[2].lod(); + for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) { + ASSERT_EQ(out[2].data()[j], label); + } + } +} + +TEST(CTR_READER, read_csv_data) { + std::string file_name = "test_ctr_reader_data.csv"; + const std::vector csv_data = { + "0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n", + "3 3.1,3.1 3,3,3,3\n", + }; + GenereteCsvData(file_name, csv_data); + + LoDTensorBlockingQueueHolder queue_holder; + int capacity = 64; + queue_holder.InitOnce(capacity, false); + + std::shared_ptr queue = queue_holder.GetQueue(); + + int batch_size = 3; + int thread_num = 1; + std::vector file_list; + for (int i = 0; i < thread_num; ++i) { + file_list.push_back(file_name); + } + DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {}); + + CTRReader reader(queue, thread_num, data_desc); + + for (size_t i = 0; i < 2; ++i) { + reader.Start(); + std::vector out; + while (true) { + reader.ReadNext(&out); + if (out.empty()) { + break; + } + CheckReadCsvOut(out); + } + reader.Shutdown(); + } +} diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 3f041ff7e4e32b407729a22aab25d3aab199fee0..5b53edff5d8ea79a03542231dbf34f5a6f254986 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -32,10 +32,8 @@ class LoDTensorBlockingQueue { friend class LoDTensorBlockingQueueHolder; private: - LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims, - bool speed_test_mode = false) - : queue_(capacity, speed_test_mode), dims_(dims) {} + explicit LoDTensorBlockingQueue(size_t capacity, bool speed_test_mode = false) + : queue_(capacity, speed_test_mode) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -65,17 +63,15 @@ class LoDTensorBlockingQueue { private: 
BlockingQueue> queue_; - std::vector dims_; }; class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims, - bool speed_test_mode = false) { + void InitOnce(size_t capacity, bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); + queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index a0b70938d354cbb3bf10a9c8c589ba5153624f45..8fe638ac2fdc6e0baed7d6cd3c57b72f23164129 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -27,13 +27,13 @@ class ReadInferShape : public framework::InferShapeBase { "The ReadOp must take a reader as input."); PADDLE_ENFORCE(ctx->HasOutputs("Out"), "The ReadOp should be assigned with output."); - std::vector reader_dims = ctx->GetReaderDims("Reader"); - std::vector out_names = ctx->Outputs("Out"); - PADDLE_ENFORCE_EQ( - reader_dims.size(), out_names.size(), - "The reader's dim number doesn't match the output number."); - ctx->SetOutputsDim("Out", reader_dims); - if (!ctx->IsRuntime()) { + if (!ctx->IsRuntime() && ctx->Attrs().Get("infer_out")) { + std::vector reader_dims = ctx->GetReaderDims("Reader"); + std::vector out_names = ctx->Outputs("Out"); + PADDLE_ENFORCE_EQ( + reader_dims.size(), out_names.size(), + "The reader's dim number doesn't match the output number."); + ctx->SetOutputsDim("Out", reader_dims); auto in_desc = boost::get(ctx->GetInputVarPtrs("Reader")[0]); auto in_lod_levels = in_desc->GetLoDLevels(); @@ -53,15 +53,18 @@ class ReadInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { - std::string reader_name = op_desc.Input("Reader")[0]; - std::vector out_names = op_desc.Output("Out"); - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - auto dtypes = reader->GetDataTypes(); - PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); - for (size_t i = 0; i < dtypes.size(); ++i) { - framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetDataType(dtypes[i]); + bool infer_out = boost::get(op_desc.GetAttr("infer_out")); + if (infer_out) { + std::string reader_name = op_desc.Input("Reader")[0]; + std::vector out_names = op_desc.Output("Out"); + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + auto dtypes = reader->GetDataTypes(); + PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + for (size_t i = 0; i < dtypes.size(); ++i) { + framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); + out.SetType(framework::proto::VarType::LOD_TENSOR); + out.SetDataType(dtypes[i]); + } } } }; @@ -73,6 +76,7 @@ class ReadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { + VLOG(3) << "read op in"; framework::ReaderHolder* reader = detail::Ref(scope.FindVar(Input("Reader")), "Cannot find reader variable %s", Input("Reader")) @@ -87,7 +91,9 @@ class ReadOp : public framework::OperatorBase { reader->ReadNext(&ins); if (ins.empty()) { + VLOG(3) << "read empty data in"; if (Attr("throw_eof_exp")) { + VLOG(3) << "throw_eof_exp"; 
PADDLE_THROW_EOF(); } else { ins.resize(out_arg_names.size()); @@ -96,6 +102,7 @@ class ReadOp : public framework::OperatorBase { tensor.mutable_data(framework::make_ddim({0}), dev_place); } } + VLOG(3) << "read empty data out"; } PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { @@ -120,6 +127,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker { " only when the data-balance is enabled in ParallelExecutor" " and it is set by ParallelExecutor instance, not users.") .SetDefault(true); + AddAttr("infer_out", "").SetDefault(true); AddComment(R"DOC( Read Operator diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index b82aab1214992be73d876a42424234e3cea46455..3921eedf94abbe68bed035940913f830a6c16e48 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() { "It means the reader will generate two data each time," "whose shapes are [2,3,4] and [5,6] respectively."); AddAttr>("lod_levels", "The LoD levels of each data."); + AddAttr( + "use_data_config", + "Use the config of all datas like shape_concat/ranks/lod_levels") + .SetDefault(true); Apply(); } @@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasOutput("Out"), "The output file reader should not be null."); - const auto shape_concat = ctx->Attrs().Get>("shape_concat"); - const auto ranks = ctx->Attrs().Get>("ranks"); - std::vector shapes = RestoreShapes(shape_concat, ranks); - ctx->SetReaderDims("Out", shapes); - - const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), - "The number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); - framework::VarDesc* reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - reader->SetLoDLevels(lod_levels); + bool use_data_config = ctx->Attrs().Get("use_data_config"); + if (use_data_config) { + const auto shape_concat = + ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); + } } void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 8eab3a6f891f1dfa91c5ce316f1419df2cd42248..32365d6a9602fa8ad2c01b59c8cd361d52ed973f 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -330,6 +330,7 @@ class Reshape2GradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, paddle::framework::DefaultGradOpDescMaker); @@ -356,16 +357,20 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, 
ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); + int64_t, ops::ReshapeKernel, plat::float16, + ops::ReshapeKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel, plat::float16, ops::ReshapeGradKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); + int64_t, ops::ReshapeKernel, plat::float16, + ops::ReshapeKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel, plat::float16, ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index bcec6f3563df7f4e1e48554cc891d596f9e56024..8d695fdedd04055215864ca4f0a7059ed7a5d6b0 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/roi_align_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -255,8 +256,8 @@ class GPUROIAlignOpKernel : public framework::OpKernel { Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; PADDLE_ENFORCE_EQ( @@ -270,14 +271,18 @@ class GPUROIAlignOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - Tensor roi_batch_id_list_gpu; - framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), - &roi_batch_id_list_gpu); - GPUROIAlignForward< - T><<>>( + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + GPUROIAlignForward<<>>( output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, - roi_batch_id_list_gpu.data(), + height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, out->mutable_data(ctx.GetPlace())); } }; @@ -307,8 +312,8 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { } Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; for (int n = 0; n < rois_batch_size; ++n) { @@ -316,24 +321,28 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - Tensor roi_batch_id_list_gpu; - framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), - &roi_batch_id_list_gpu); - + 
auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto roi_ptr = allocator.Allocate(roi_batch_id_list.numel() * sizeof(int)); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + int bytes = roi_batch_id_list.numel() * sizeof(int); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); in_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); + set_zero(dev_ctx, in_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIAlignBackward< - T><<>>( + GPUROIAlignBackward<<>>( output_grad_size, rois->data(), out_grad->data(), rois_num, spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, roi_batch_id_list_gpu.data(), + sampling_ratio, roi_id_data, in_grad->mutable_data(ctx.GetPlace())); } } diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 75c3dd6bc498e35c6249f79a1c24cfe17316670e..ac3a4201e65256ae16c3376b385dd6000da60fe6 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/roi_pool_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -152,8 +153,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel { framework::Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; PADDLE_ENFORCE_EQ( @@ -168,15 +169,20 @@ class GPUROIPoolOpKernel : public framework::OpKernel { } } - framework::Tensor roi_batch_id_list_gpu; - framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &roi_batch_id_list_gpu); - - GPUROIPoolForward< - T><<>>( + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + + GPUROIPoolForward<<>>( output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, - roi_batch_id_list_gpu.data(), out->mutable_data(ctx.GetPlace()), + height, width, pooled_height, pooled_width, roi_id_data, + out->mutable_data(ctx.GetPlace()), argmax->mutable_data(ctx.GetPlace())); } }; @@ -204,8 +210,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (x_grad) { framework::Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = 
rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; for (int n = 0; n < rois_batch_size; ++n) { @@ -213,25 +219,30 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - framework::Tensor roi_batch_id_list_gpu; - framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &roi_batch_id_list_gpu); + + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); x_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); + set_zero(dev_ctx, x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIPoolBackward< - T><<>>( + GPUROIPoolBackward<<>>( output_grad_size, rois->data(), out_grad->data(), argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, - roi_batch_id_list_gpu.data(), + width, pooled_height, pooled_width, roi_id_data, x_grad->mutable_data(ctx.GetPlace())); } } diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index c07e6962e673ceb274ef31cbf492f378ae696137..27e0201bd70df59c58eaa7567d5bb69eb1b721b4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -68,6 +68,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel { "Level number of Input(X)'s lod could be 0. Otherwise " "size of Input(X)'s first level lod should be equal to " "size of Input(Y)'s referred level lod."); + } else { + PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1, + "When Input(X)'s lod is null, the dims[0] of " + "Input(X) should match the " + "size of Input(Y)'s referred level lod."); } int64_t out_first_dim = 0; diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9349912e090f2ad3248923c87b50c8d72b0d84d1 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -0,0 +1,113 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/shuffle_channel_op.h" + +namespace paddle { +namespace operators { + +class ShuffleChannelOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ShuffleChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ShuffleChannelOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), " + "the input feature data of ShuffleChannelOp, the layout is NCHW."); + AddOutput("Out", + "(Tensor, default Tensor), the output of " + "ShuffleChannelOp. The layout is NCHW."); + AddAttr("group", "the number of groups.") + .SetDefault(1) + .AddCustomChecker([](const int& group) { + PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0."); + }); + + AddComment(R"DOC( + Shuffle Channel operator + This opearator shuffles the channels of input x. + It divide the input channels in each group into several subgroups, + and obtain a new order by selecting element from every subgroup one by one. + + Shuffle channel operation makes it possible to build more powerful structures + with multiple group convolutional layers. + please get more information from the following paper: + https://arxiv.org/pdf/1707.01083.pdf + )DOC"); + } +}; + +class ShuffleChannelGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) should not be null"); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + ctx->SetOutputDim(framework::GradVarName("X"), input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, + ops::ShuffleChannelOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); + +REGISTER_OP_CPU_KERNEL( + shuffle_channel, + ops::ShuffleChannelOpKernel, + ops::ShuffleChannelOpKernel); + +REGISTER_OP_CPU_KERNEL( + shuffle_channel_grad, + ops::ShuffleChannelGradOpKernel, + ops::ShuffleChannelGradOpKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9506343b3d508459c6e10dc68eba13504b07338f --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/shuffle_channel_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void ShuffleChannel(const int nthreads, const int feature_map_size, + T* output, const T* input, int group_row, + int group_column, int len) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t ii = index; ii < nthreads; ii += offset) { + const int n = index / group_row / group_column / len; + const int i = (index / group_column / len) % group_row; + const int j = index / len % group_column; + const int k = index - (n * feature_map_size + (i * group_column + j) * len); + T* p_o = output + n * feature_map_size + (j * group_row + i) * len; + p_o[k] = input[index]; + } +} +template +class ShuffleChannelOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + int group_row = group; + int group_column = channel / group_row; + // count is the product of NCHW same as numel() + int count = num * group_column * group_row * sp_sz; + + int blocks = NumBlocks(output->numel()); + int threads = kNumCUDAThreads; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + ShuffleChannel< + T><<>>( + count, feature_map_size, output_data, input_data, group_row, + group_column, sp_sz); + } +}; + +template +class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + + int group_row = group; + int group_column = channel / group_row; + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + + int blocks = NumBlocks(output_grad->numel()); + int threads = kNumCUDAThreads; + int 
count = num * group_column * group_row * sp_sz; + + ShuffleChannel< + T><<>>( + count, feature_map_size, input_grad_data, output_grad_data, group_row, + group_column, sp_sz); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + shuffle_channel, + ops::ShuffleChannelOpCUDAKernel, + ops::ShuffleChannelOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + shuffle_channel_grad, + ops::ShuffleChannelGradOpCUDAKernel, + ops::ShuffleChannelGradOpCUDAKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f6af1bc88598870ebccef81bd37f93f376940851 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class ShuffleChannelOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + int group_row = group; + int group_column = channel / group_row; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + for (int n = 0; n < num; ++n) { + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const T* p_i = input_data + n * feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = + output_data + n * feature_map_size + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(int) * sp_sz); + } + } + } + } +}; + +template +class ShuffleChannelGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + + int group_row = group; + int group_column = channel / group_row; + + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + for (int n = 0; n < num; ++n) { + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const T* p_i = output_grad_data + n * 
feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = input_grad_data + n * feature_map_size + + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(int) * sp_sz); + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 14746fa95159d707be7c10c69a4ffc2211e17a93..c21b0c13c752b82b80c120cb5a5d4a010ef18287 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -101,6 +101,10 @@ class SigmoidCrossEntropyWithLogitsOpMaker AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); + AddAttr("normalize", + "if true, divide the loss by the number of " + "targets != ignore_index.") + .SetDefault(false); AddAttr("ignore_index", "(int, default kIgnoreIndex), Specifies a target value that " "is ignored and" @@ -145,9 +149,14 @@ REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp); -REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL( + sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel, + ops::SigmoidCrossEntropyWithLogitsKernel); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, float>); + paddle::platform::CPUDeviceContext, float>, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index a1fbc7e5fab71df486b53c31464c99e9c4557ccd..2a4570ef5cec0bee07efd69a2efd1a079ff33df5 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -11,12 +11,184 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "cub/cub.cuh" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static HOSTDEVICE float real_exp(float x) { return expf(x); } +static HOSTDEVICE float real_exp(double x) { return exp(x); } +static HOSTDEVICE float real_log(float x) { return logf(x); } +static HOSTDEVICE float real_log(double x) { return log(x); } + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, + const int ignore_index, const int limit, + T *out_data, T *counts) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T label = label_data[i]; + T eps = static_cast(1e-5); + T diff = label - static_cast(ignore_index); + if ((diff > -eps) && (diff < eps)) { + out_data[i] = static_cast(0.); + counts[i] = 0; + } else { + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); + out_data[i] = term1 - term2 + term3; + counts[i] = 1; + } + } +} + +template +__global__ void Sum(const T *counts, int num, const T eps, T *sum) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T in = 0; + for (int i = threadIdx.x; i < num; i += BlockDim) { + in += counts[i]; + } + __syncthreads(); + auto out = + BlockReduce(temp_storage).Reduce(static_cast(in), cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + T a = out > eps ? 
out : eps; + sum[0] = a; + } +} + +template +__global__ void Div(T *loss, const int num, const T *norm) { + CUDA_1D_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } +} + +template +__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, + const int ignore_index, const T *dout_data, + const int limit, T *dx_data, T *counts) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T label = label_data[i]; + T dout = dout_data[i]; + T eps = static_cast(1e-5); + T diff = label - static_cast(ignore_index); + if ((diff > -eps) && (diff < eps)) { + dx_data[i] = static_cast(0.); + counts[i] = 0; + } else { + T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); + T diff = simoid_x - label; + dx_data[i] = dout * diff; + counts[i] = 1; + } + } +} + +// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) +template +class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + Tensor *Out = context.Output("Out"); + int ignore_index = context.Attr("ignore_index"); + auto out_data = Out->mutable_data(context.GetPlace()); + + auto &dev_ctx = context.cuda_device_context(); + bool normalize = context.Attr("normalize"); + + // Temporary memory + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cnt_ptr = allocator.Allocate(Labels->numel() * sizeof(T)); + T *counts = reinterpret_cast(cnt_ptr->ptr()); + + int limit = Out->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidForward<<>>( + X->data(), Labels->data(), ignore_index, limit, out_data, counts); + if (normalize) { + auto norm_ptr = allocator.Allocate(sizeof(T)); + T *norm = reinterpret_cast(norm_ptr->ptr()); + Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( + counts, limit, static_cast(1e-5), norm); + Div<<>>(out_data, limit, norm); + } + } +}; + +// dX = sigmoid(X) - labels +template +class GPUSigmoidCrossEntropyWithLogitsGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); + auto dx_data = dX->mutable_data(context.GetPlace()); + + int ignore_index = context.Attr("ignore_index"); + + auto &dev_ctx = context.cuda_device_context(); + // Temporary memory + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cnt_ptr = allocator.Allocate(X->numel() * sizeof(T)); + T *counts = reinterpret_cast(cnt_ptr->ptr()); + + int limit = dX->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidBackward<<>>( + X->data(), Labels->data(), ignore_index, dOut->data(), limit, + dx_data, counts); + bool normalize = context.Attr("normalize"); + if (normalize) { + auto norm_ptr = allocator.Allocate(sizeof(T)); + T *norm = reinterpret_cast(norm_ptr->ptr()); + Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( + counts, limit, static_cast(1e-5), norm); + Div<<>>(dx_data, limit, norm); + } + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, float>); + 
ops::GPUSigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>, + ops::GPUSigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, double>); REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, float>); + ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>, + ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index 6e75f9e0b8d825b100de5c46f151a808cdb1b9d5..8f459d573ae5930c27a97c39ac79231384c3d12f 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -13,54 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/eigen.h" +#include +#include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/hostdevice.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenMatrix = framework::EigenMatrix; - -template -struct SigmoidCrossEntropyWithLogitsForward { - HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) - : ignore_index(ignore_index) {} - - HOSTDEVICE T operator()(const T &x, const T &label) const { - if (static_cast(label) == ignore_index) { - return static_cast(0.); - } - T term1 = (x > 0) ? x : 0; - T term2 = x * label; - T term3 = std::log(static_cast(1) + std::exp(-(std::abs(x)))); - return term1 - term2 + term3; - } - - int ignore_index; -}; - -template -struct SigmoidCrossEntropyWithLogitsBackward { - HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) - : ignore_index(ignore_index) {} - - HOSTDEVICE T operator()(const T &x, const T &label) const { - if (static_cast(label) == ignore_index) { - return static_cast(0.); - } - T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); - return simoid_x - label; - } - - int ignore_index; -}; // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template @@ -70,16 +30,37 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { const Tensor *X = context.Input("X"); const Tensor *Labels = context.Input("Label"); Tensor *Out = context.Output("Out"); - Out->mutable_data(context.GetPlace()); int ignore_index = context.Attr("ignore_index"); - - auto x = EigenVector::Flatten(*X); - auto labels = EigenVector::Flatten(*Labels); - auto out = EigenVector::Flatten(*Out); - auto &place = *context.device_context().eigen_device(); - - out.device(place) = x.binaryExpr( - labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); + auto out_data = Out->mutable_data(context.GetPlace()); + int limit = Out->numel(); + auto x_data = X->data(); + auto label_data = Labels->data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + if (static_cast(label) == ignore_index) { + out_data[idx] = static_cast(0.); + } else { + T term1 = (x > 0) ? 
x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); + out_data[idx] = term1 - term2 + term3; + } + } + bool normalize = context.Attr("normalize"); + if (normalize) { + int norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? norm : eps; + std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; }); + } } }; @@ -92,19 +73,39 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { const Tensor *Labels = context.Input("Label"); const Tensor *dOut = context.Input(framework::GradVarName("Out")); Tensor *dX = context.Output(framework::GradVarName("X")); - dX->mutable_data(context.GetPlace()); - - auto ignore_index = context.Attr("ignore_index"); - auto x = EigenVector::Flatten(*X); - auto labels = EigenVector::Flatten(*Labels); - auto dout = EigenVector::Flatten(*dOut); - auto dx = EigenVector::Flatten(*dX); - auto &place = - *context.template device_context().eigen_device(); + auto dx_data = dX->mutable_data(context.GetPlace()); - auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward( - static_cast(ignore_index))); - dx.device(place) = dout * diff; + int ignore_index = context.Attr("ignore_index"); + int limit = dX->numel(); + auto x_data = X->data(); + auto label_data = Labels->data(); + auto dout_data = dOut->data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + T dout = dout_data[idx]; + if (static_cast(label) == ignore_index) { + dx_data[idx] = static_cast(0.); + } else { + T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + T diff = simoid_x - label; + dx_data[idx] = dout * diff; + } + } + bool normalize = context.Attr("normalize"); + if (normalize) { + int norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? 
norm : eps; + std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; }); + } } }; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 789e61b2d332b9391ef45a8ebe58ad0f1a4d2bf0..94995fc99612adb1164e60f1a51747f74eacfb73 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -54,6 +54,9 @@ class SliceOp : public framework::OperatorWithKernel { out_dims[axes[i]] = end - start; } ctx->SetOutputDim("Out", out_dims); + if (axes[0] != 0) { + ctx->ShareLoD("Input", /*->*/ "Out"); + } } protected: diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index bf2a9e5b3d22996e688621727cb280dc9aed7859..24d0b2f906a8e0b360c3f477c9290ebe5d57a3ff 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -17,13 +17,16 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel, - ops::StackKernel, - ops::StackKernel, - ops::StackKernel); +REGISTER_OP_CUDA_KERNEL( + stack, ops::StackKernel, + ops::StackKernel, + ops::StackKernel, + ops::StackKernel, + ops::StackKernel); -REGISTER_OP_CUDA_KERNEL(stack_grad, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel); +REGISTER_OP_CUDA_KERNEL( + stack_grad, ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index b993c55fad13e892efd51648b78704bec83bf2b4..031335009b692f9d1f73070c88e8e79d852cbe36 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); + AddAttr("calibration_data", "the calibration data for int8"); + AddAttr( + "engine_key", + "The engine_key here is used to distinguish different TRT Engines"); AddAttr("max_batch_size", "the maximum batch size."); AddAttr("workspace_size", "the workspace size."); + AddAttr("sub_block", "the trt block"); + AddAttr("enable_int8", "whether to switch to int8 mode"); AddComment("TensorRT engine operator."); } }; @@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, - ops::TensorRTEngineOpMaker); + ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 88c4f508474e66953b79fb92ff1eb0b53a539f07..2ff35c7c6ac6409d529de5b794bfc322b1f5dd9b 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -17,8 +17,10 @@ #ifdef PADDLE_WITH_CUDA #include +#include #include +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" @@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { using inference::Singleton; using inference::tensorrt::TensorRTEngine; +using inference::tensorrt::TRTInt8Calibrator; +using 
inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; class TensorRTEngineOp : public framework::OperatorBase { private: @@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase { mutable std::unique_ptr trt_engine_; int max_batch_size_; int workspace_size_; + std::unique_ptr calibrator_; + bool enable_int8_; + std::string calibration_data_; + std::string engine_key_; + bool calibration_mode_; public: TensorRTEngineOp(const std::string &type, @@ -80,26 +90,108 @@ class TensorRTEngineOp : public framework::OperatorBase { input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); + enable_int8_ = Attr("enable_int8"); + calibration_data_ = Attr("calibration_data"); + engine_key_ = Attr("engine_key"); auto params = Attr>("parameters"); for (const auto &param : params) { param_names_.insert(param); } + // calibration_mode being true means we need to + // generate the calibration table data. + calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0); + + VLOG(4) << "calibration_mode: " << calibration_mode_; + if (enable_int8_ && calibration_data_.size()) { + calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); + } } protected: + void RunNativeImpl(const framework::Scope &scope, + const platform::Place &dev_place) const { + framework::Executor executor(dev_place); + auto *block = Attr("sub_block"); + auto *program = block->Program(); + auto &current_scope = scope.NewScope(); + auto ctx = executor.Prepare(*program, block->ID()); + executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true); + } + void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + if (calibration_mode_ == true) { + RunCalibration(scope, dev_place); + return; + } RunTrt(scope, dev_place); } + void RunCalibration(const framework::Scope &scope, + const platform::Place &dev_place) const { + // This process builds a 32-bit trt engine, runs it on the calibration + // set, and records a histogram for each + // tensor of the distribution of activation values. + LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ + << " is running calibration trt int8... 
"; + int runtime_batch = 1; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx).stream(); + if (!Singleton::Global().Has(engine_key_)) { + TRTCalibratorEngine *calib_res = + Singleton::Global().Create(engine_key_); + std::unordered_map calib_buffers; + for (auto &x : input_names_) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_buffers[x] = t.memory_size(); + auto t_shape = framework::vectorize(t.dims()); + runtime_batch = t_shape[0]; + } + calib_res->calib_.reset(new TRTInt8Calibrator( + calib_buffers, runtime_batch, engine_key_, dev_place)); + calib_res->thr_.reset(new std::thread([&]() { + calib_res->engine_.reset(new TensorRTEngine( + max_batch_size_, workspace_size_, stream, + boost::get(dev_place).device, enable_int8_, + calib_res->calib_.get())); + VLOG(3) << "start the calib trt engine thread"; + Prepare(scope, dev_place, calib_res->engine_.get()); + })); + } + + TRTInt8Calibrator *temp_calibrator = + Singleton::Global() + .Get(engine_key_) + ->calib_.get(); + std::unordered_map calib_data; + + for (auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_data.emplace(x, t.data()); + } + temp_calibrator->setBatch(calib_data); + RunNativeImpl(scope, dev_place); + } + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place) const { int runtime_batch = 1; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx).stream(); if (trt_engine_.get() == nullptr) { - trt_engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, nullptr, - boost::get(dev_place).device)); + trt_engine_.reset( + new TensorRTEngine(max_batch_size_, workspace_size_, stream, + boost::get(dev_place).device, + enable_int8_, calibrator_.get())); Prepare(scope, dev_place, trt_engine_.get()); } @@ -126,6 +218,7 @@ class TensorRTEngineOp : public framework::OperatorBase { } } + cudaStreamSynchronize(stream); PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); // Execute the engine. engine->Execute(runtime_batch); @@ -163,12 +256,13 @@ class TensorRTEngineOp : public framework::OperatorBase { output_index += 1; } - cudaStreamSynchronize(*engine->stream()); + cudaStreamSynchronize(stream); } void Prepare(const framework::Scope &scope, const platform::Place &dev_place, TensorRTEngine *engine) const { - VLOG(4) << "Prepare engine"; + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). 
This process may cost a lot of time."; framework::proto::BlockDesc block_desc; block_desc.ParseFromString(Attr("subgraph")); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 287b0edc96e5e312b0ff1725ee188ff319d44d23..5a3d9d2c1a3e8111acbad2ddcf4f5469a3a99751 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); - SetAttr>(engine_op_desc.Proto(), "parameters", - std::vector({})); - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z0"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(2)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", std::vector({})); + engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z0"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); LOG(INFO) << "create engine op"; - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; @@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetInput("Xs", std::vector({"x0"})); engine_op_desc.SetOutput("Ys", std::vector({"z3"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); - SetAttr>( - engine_op_desc.Proto(), "parameters", - std::vector({"y0", "y1", "y2", "y3"})); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); - - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z3"})); - - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", + std::vector({"y0", "y1", "y2", "y3"})); + engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z3"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); // Execute them. 
engine_op->Run(scope, place); diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc index b4025350fa9f3610bde43eee91cd059f3063813f..915774e5f3624f26dbd1451a99d7bf0bf75a72c8 100644 --- a/paddle/fluid/operators/transpose_op.cu.cc +++ b/paddle/fluid/operators/transpose_op.cu.cc @@ -15,19 +15,27 @@ limitations under the License. */ #include "paddle/fluid/operators/transpose_op.h" namespace ops = paddle::operators; +namespace plat = paddle::platform; + REGISTER_OP_CUDA_KERNEL( transpose, ops::TransposeKernel, - ops::TransposeKernel); + ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CUDA_KERNEL( transpose_grad, ops::TransposeGradKernel, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); REGISTER_OP_CUDA_KERNEL( transpose2, ops::TransposeKernel, - ops::TransposeKernel); + ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CUDA_KERNEL( transpose2_grad, ops::TransposeGradKernel, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc index 5e16a209e712a143e1083e171f88002817aef838..a764d59410c90535dbda0b3f11e89ae9bf578c04 100644 --- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc +++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc @@ -144,19 +144,17 @@ class CudnnCTCKernel : public framework::OpKernel { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size)); T* loss_data = loss->mutable_data(loss_dims, ctx.GetPlace()); - math::SetConstant()( - ctx.template device_context(), loss, static_cast(0)); - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size); - void* cudnn_workspace = temp_allocation->ptr(); - - CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( - handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, - warpctc_label_lengths.data(), warpctc_logits_lengths.data(), loss_data, - cu_grad_desc, warpctc_grad_data, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, - cu_ctcloss_desc, cudnn_workspace, workspace_size)); + + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( + handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), + loss_data, cu_grad_desc, warpctc_grad_data, + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h deleted file mode 100644 index 0bb285722ddedf721d98237760ec9868e2134442..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ /dev/null @@ -1,483 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; -template -using EigenVector = framework::EigenVector; - -using Array5 = Eigen::DSizes; - -template -static inline bool isZero(T x) { - return fabs(x) < 1e-6; -} - -template -static inline T sigmoid(T x) { - return 1.0 / (exp(-1.0 * x) + 1.0); -} - -template -static inline T CalcMaskPointNum(const Tensor& mask) { - auto mask_t = EigenVector::Flatten(mask); - T count = 0.0; - for (int i = 0; i < mask_t.dimensions()[0]; i++) { - if (mask_t(i)) { - count += 1.0; - } - } - return count; -} - -template -static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, - const Tensor& mask) { - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); - - T error_sum = 0.0; - T points = 0.0; - for (int i = 0; i < x_t.dimensions()[0]; i++) { - if (mask_t(i)) { - error_sum += pow(x_t(i) - y_t(i), 2); - points += 1; - } - } - return (error_sum / points); -} - -template -static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y, - const Tensor& mask, T mf) { - auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); - - for (int i = 0; i < x_t.dimensions()[0]; i++) { - if (mask_t(i)) { - grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf; - } - } -} - -template -static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, - const Tensor& mask) { - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); - - T error_sum = 0.0; - T points = 0.0; - for (int i = 0; i < x_t.dimensions()[0]; i++) { - if (mask_t(i)) { - error_sum += - -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i))); - points += 1; - } - } - return (error_sum / points); -} - -template -static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x, - const Tensor& y, const Tensor& mask, - T mf) { - auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); - - for (int i = 0; i < x_t.dimensions()[0]; i++) { - if (mask_t(i)) { - grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf; - } - } -} - -template -static void CalcPredResult(const Tensor& input, Tensor* pred_conf, - Tensor* pred_class, Tensor* pred_x, Tensor* pred_y, - Tensor* pred_w, Tensor* pred_h, const int anchor_num, - const int class_num) { - const int n = input.dims()[0]; - const int h = input.dims()[2]; - const int w = input.dims()[3]; - const int box_attr_num = 5 + class_num; - - auto input_t = EigenTensor::From(input); - auto pred_conf_t = EigenTensor::From(*pred_conf); - auto pred_class_t = EigenTensor::From(*pred_class); - auto pred_x_t = EigenTensor::From(*pred_x); - auto pred_y_t = EigenTensor::From(*pred_y); - auto pred_w_t = EigenTensor::From(*pred_w); - auto pred_h_t = EigenTensor::From(*pred_h); - - for (int i = 0; i < n; i++) { - for (int an_idx = 0; an_idx < anchor_num; an_idx++) { - for (int j = 0; j < h; j++) { - for (int k = 0; k < w; k++) { - pred_x_t(i, an_idx, j, k) = - sigmoid(input_t(i, box_attr_num * an_idx, j, k)); - pred_y_t(i, an_idx, j, k) = - sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k)); - pred_w_t(i, an_idx, 
j, k) = - input_t(i, box_attr_num * an_idx + 2, j, k); - pred_h_t(i, an_idx, j, k) = - input_t(i, box_attr_num * an_idx + 3, j, k); - - pred_conf_t(i, an_idx, j, k) = - sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k)); - - for (int c = 0; c < class_num; c++) { - pred_class_t(i, an_idx, j, k, c) = - sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); - } - } - } - } - } -} - -template -static T CalcBoxIoU(std::vector box1, std::vector box2) { - T b1_x1 = box1[0] - box1[2] / 2; - T b1_x2 = box1[0] + box1[2] / 2; - T b1_y1 = box1[1] - box1[3] / 2; - T b1_y2 = box1[1] + box1[3] / 2; - T b2_x1 = box2[0] - box2[2] / 2; - T b2_x2 = box2[0] + box2[2] / 2; - T b2_y1 = box2[1] - box2[3] / 2; - T b2_y2 = box2[1] + box2[3] / 2; - - T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1); - T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1); - - T inter_rect_x1 = std::max(b1_x1, b2_x1); - T inter_rect_y1 = std::max(b1_y1, b2_y1); - T inter_rect_x2 = std::min(b1_x2, b2_x2); - T inter_rect_y2 = std::min(b1_y2, b2_y2); - T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast(0.0)) * - std::max(inter_rect_y2 - inter_rect_y1, static_cast(0.0)); - - return inter_area / (b1_area + b2_area - inter_area); -} - -template -static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, - const float ignore_thresh, std::vector anchors, - const int grid_size, Tensor* obj_mask, - Tensor* noobj_mask, Tensor* tx, Tensor* ty, - Tensor* tw, Tensor* th, Tensor* tconf, - Tensor* tclass) { - const int n = gt_box.dims()[0]; - const int b = gt_box.dims()[1]; - const int anchor_num = anchors.size() / 2; - auto gt_box_t = EigenTensor::From(gt_box); - auto gt_label_t = EigenTensor::From(gt_label); - auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0); - auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); - auto tx_t = EigenTensor::From(*tx).setConstant(0.0); - auto ty_t = EigenTensor::From(*ty).setConstant(0.0); - auto tw_t = EigenTensor::From(*tw).setConstant(0.0); - auto th_t = EigenTensor::From(*th).setConstant(0.0); - auto tconf_t = EigenTensor::From(*tconf).setConstant(0.0); - auto tclass_t = EigenTensor::From(*tclass).setConstant(0.0); - - for (int i = 0; i < n; i++) { - for (int j = 0; j < b; j++) { - if (isZero(gt_box_t(i, j, 0)) && isZero(gt_box_t(i, j, 1)) && - isZero(gt_box_t(i, j, 2)) && isZero(gt_box_t(i, j, 3))) { - continue; - } - - int cur_label = gt_label_t(i, j); - T gx = gt_box_t(i, j, 0) * grid_size; - T gy = gt_box_t(i, j, 1) * grid_size; - T gw = gt_box_t(i, j, 2) * grid_size; - T gh = gt_box_t(i, j, 3) * grid_size; - int gi = static_cast(gx); - int gj = static_cast(gy); - - T max_iou = static_cast(0); - T iou; - int best_an_index = -1; - std::vector gt_box_shape({0, 0, gw, gh}); - for (int an_idx = 0; an_idx < anchor_num; an_idx++) { - std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), - static_cast(anchors[2 * an_idx + 1])}); - iou = CalcBoxIoU(gt_box_shape, anchor_shape); - if (iou > max_iou) { - max_iou = iou; - best_an_index = an_idx; - } - if (iou > ignore_thresh) { - noobj_mask_t(i, an_idx, gj, gi) = 0; - } - } - obj_mask_t(i, best_an_index, gj, gi) = 1; - noobj_mask_t(i, best_an_index, gj, gi) = 0; - tx_t(i, best_an_index, gj, gi) = gx - gi; - ty_t(i, best_an_index, gj, gi) = gy - gj; - tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); - th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); - tclass_t(i, best_an_index, gj, gi, cur_label) = 1; - tconf_t(i, best_an_index, gj, gi) = 1; - } - } -} - 
-static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand, - const Tensor& obj_mask) { - const int n = obj_mask_expand->dims()[0]; - const int an_num = obj_mask_expand->dims()[1]; - const int h = obj_mask_expand->dims()[2]; - const int w = obj_mask_expand->dims()[3]; - const int class_num = obj_mask_expand->dims()[4]; - auto obj_mask_expand_t = EigenTensor::From(*obj_mask_expand); - auto obj_mask_t = EigenTensor::From(obj_mask); - - obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1)) - .broadcast(Array5(1, 1, 1, 1, class_num)); -} - -template -static void AddAllGradToInputGrad( - Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y, - const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, - const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, - const Tensor& grad_conf_target, const Tensor& grad_conf_notarget, - const Tensor& grad_class, const int class_num, const float loss_weight_xy, - const float loss_weight_wh, const float loss_weight_conf_target, - const float loss_weight_conf_notarget, const float loss_weight_class) { - const int n = pred_x.dims()[0]; - const int an_num = pred_x.dims()[1]; - const int h = pred_x.dims()[2]; - const int w = pred_x.dims()[3]; - const int attr_num = class_num + 5; - auto grad_t = EigenTensor::From(*grad).setConstant(0.0); - auto pred_x_t = EigenTensor::From(pred_x); - auto pred_y_t = EigenTensor::From(pred_y); - auto pred_conf_t = EigenTensor::From(pred_conf); - auto pred_class_t = EigenTensor::From(pred_class); - auto grad_x_t = EigenTensor::From(grad_x); - auto grad_y_t = EigenTensor::From(grad_y); - auto grad_w_t = EigenTensor::From(grad_w); - auto grad_h_t = EigenTensor::From(grad_h); - auto grad_conf_target_t = EigenTensor::From(grad_conf_target); - auto grad_conf_notarget_t = EigenTensor::From(grad_conf_notarget); - auto grad_class_t = EigenTensor::From(grad_class); - - for (int i = 0; i < n; i++) { - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - grad_t(i, j * attr_num, k, l) = - grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) * - (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy; - grad_t(i, j * attr_num + 1, k, l) = - grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * - (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy; - grad_t(i, j * attr_num + 2, k, l) = - grad_w_t(i, j, k, l) * loss * loss_weight_wh; - grad_t(i, j * attr_num + 3, k, l) = - grad_h_t(i, j, k, l) * loss * loss_weight_wh; - grad_t(i, j * attr_num + 4, k, l) = - grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target; - grad_t(i, j * attr_num + 4, k, l) += - grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss * - loss_weight_conf_notarget; - - for (int c = 0; c < class_num; c++) { - grad_t(i, j * attr_num + 5 + c, k, l) = - grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * - (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class; - } - } - } - } - } -} - -template -class Yolov3LossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* gt_box = ctx.Input("GTBox"); - auto* gt_label = ctx.Input("GTLabel"); - auto* loss = ctx.Output("Loss"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float ignore_thresh = ctx.Attr("ignore_thresh"); - float loss_weight_xy = ctx.Attr("loss_weight_xy"); - float 
loss_weight_wh = ctx.Attr("loss_weight_wh"); - float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); - float loss_weight_conf_notarget = - ctx.Attr("loss_weight_conf_notarget"); - float loss_weight_class = ctx.Attr("loss_weight_class"); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int an_num = anchors.size() / 2; - - Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_conf, pred_class; - pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, - &pred_w, &pred_h, an_num, class_num); - - Tensor obj_mask, noobj_mask; - Tensor tx, ty, tw, th, tconf, tclass; - obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, - &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); - - Tensor obj_mask_expand; - obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, - ctx.GetPlace()); - ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); - - T loss_x = CalcMSEWithMask(pred_x, tx, obj_mask); - T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); - T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); - T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); - T loss_conf_target = CalcBCEWithMask(pred_conf, tconf, obj_mask); - T loss_conf_notarget = CalcBCEWithMask(pred_conf, tconf, noobj_mask); - T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); - - auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); - loss_data[0] = loss_weight_xy * (loss_x + loss_y) + - loss_weight_wh * (loss_w + loss_h) + - loss_weight_conf_target * loss_conf_target + - loss_weight_conf_notarget * loss_conf_notarget + - loss_weight_class * loss_class; - } -}; - -template -class Yolov3LossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* gt_box = ctx.Input("GTBox"); - auto* gt_label = ctx.Input("GTLabel"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float ignore_thresh = ctx.Attr("ignore_thresh"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Loss")); - const T loss = output_grad->data()[0]; - float loss_weight_xy = ctx.Attr("loss_weight_xy"); - float loss_weight_wh = ctx.Attr("loss_weight_wh"); - float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); - float loss_weight_conf_notarget = - ctx.Attr("loss_weight_conf_notarget"); - float loss_weight_class = ctx.Attr("loss_weight_class"); - - const int n = input->dims()[0]; - const int c = input->dims()[1]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int an_num = anchors.size() / 2; - - 
Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_conf, pred_class; - pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, - &pred_w, &pred_h, an_num, class_num); - - Tensor obj_mask, noobj_mask; - Tensor tx, ty, tw, th, tconf, tclass; - obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, - &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); - - Tensor obj_mask_expand; - obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, - ctx.GetPlace()); - ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); - - Tensor grad_x, grad_y, grad_w, grad_h; - Tensor grad_conf_target, grad_conf_notarget, grad_class; - grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_conf_target.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_conf_notarget.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - T obj_mf = CalcMaskPointNum(obj_mask); - T noobj_mf = CalcMaskPointNum(noobj_mask); - T obj_expand_mf = CalcMaskPointNum(obj_mask_expand); - CalcMSEGradWithMask(&grad_x, pred_x, tx, obj_mask, obj_mf); - CalcMSEGradWithMask(&grad_y, pred_y, ty, obj_mask, obj_mf); - CalcMSEGradWithMask(&grad_w, pred_w, tw, obj_mask, obj_mf); - CalcMSEGradWithMask(&grad_h, pred_h, th, obj_mask, obj_mf); - CalcBCEGradWithMask(&grad_conf_target, pred_conf, tconf, obj_mask, - obj_mf); - CalcBCEGradWithMask(&grad_conf_notarget, pred_conf, tconf, noobj_mask, - noobj_mf); - CalcBCEGradWithMask(&grad_class, pred_class, tclass, obj_mask_expand, - obj_expand_mf); - - input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); - AddAllGradToInputGrad( - input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, - grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class, - class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target, - loss_weight_conf_notarget, loss_weight_class); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 9f504d14a8da116648483c0f64cb511b46e6a97e..2ce8f141d3c51661305f4952479cf2889fc4f396 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include // NOTE(): support float16 to half in header file. 
#define PADDLE_CUDA_FP16 @@ -30,6 +31,34 @@ namespace platform { mask = __ballot_sync(FULL_WARP_MASK, (predicate)) #endif +inline static int RoundToPowerOfTwo(int dim) { + if (dim > 512) { + return 1024; + } else if (dim > 256) { + return 512; + } else if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + template __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, int delta, int width = 32) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 8f80a2d7822f1dc16cee2514a991b7341f5d1cfd..2493fb71c019f9923012afa4a46cb3e95479f860 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); + "Place %s is not supported, Please re-compile with WITH_GPU " + "option", + place); } return it->second.get().get(); } diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index d0619293acf2d2df0d925e969bdeb8e45cda6e2b..a260cda49138580b209e647af459e9392d9f18f1 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -70,6 +70,8 @@ extern void* mklml_dso_handle; __macro(cblas_ddot); \ __macro(cblas_sasum); \ __macro(cblas_dasum); \ + __macro(cblas_isamax); \ + __macro(cblas_idamax); \ __macro(cblas_sscal); \ __macro(cblas_dscal); \ __macro(vsAdd); \ diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 15413785bab3c0fd77244141e8f1840ca0cc1356..142d38f0609d963ce3ff45c595b8432b0e5edd21 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -71,9 +71,8 @@ struct EnforceNotMet : public std::exception { } } - template - EnforceNotMet(const char* f, int l, ARGS... args) { - Init(string::Sprintf(args...), f, l); + EnforceNotMet(const std::string& str, const char* f, int l) { + Init(str, f, l); } const char* what() const noexcept override { return err_str_.c_str(); } @@ -142,28 +141,23 @@ struct EOFException : public std::exception { inline bool is_error(bool stat) { return !stat; } -template -inline typename std::enable_if::type throw_on_error( - bool stat, const Args&... args) { +inline void throw_on_error(bool stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << msg; #endif } #ifdef PADDLE_WITH_CUDA -inline bool is_error(cudaError_t e) { return UNLIKELY(e); } +inline bool is_error(cudaError_t e) { return e != cudaSuccess; } -template -inline typename std::enable_if::type throw_on_error( - cudaError_t e, const Args&... 
args) { +inline void throw_on_error(cudaError_t e, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << msg; #endif } @@ -171,14 +165,12 @@ inline bool is_error(curandStatus_t stat) { return stat != CURAND_STATUS_SUCCESS; } -template -inline typename std::enable_if::type throw_on_error( - curandStatus_t stat, const Args&... args) { +inline void throw_on_error(curandStatus_t stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << msg; #endif } @@ -186,14 +178,11 @@ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; } -template -inline typename std::enable_if::type throw_on_error( - cudnnStatus_t stat, const Args&... args) { +inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << platform::dynload::cudnnGetErrorString(stat) << msg; #endif } @@ -201,9 +190,7 @@ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; } -template -inline typename std::enable_if::type throw_on_error( - cublasStatus_t stat, const Args&... args) { +inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { std::string err; if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; @@ -225,87 +212,45 @@ inline typename std::enable_if::type throw_on_error( err = "CUBLAS: license error, "; } #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(err + string::Sprintf(args...)); + throw std::runtime_error(err + msg); #else - LOG(FATAL) << err << string::Sprintf(args...); + LOG(FATAL) << err << msg; #endif } #if !defined(__APPLE__) && !defined(_WIN32) -template -inline typename std::enable_if::type throw_on_error( - ncclResult_t stat, const Args&... args) { - if (stat == ncclSuccess) { - return; - } else { +inline bool is_error(ncclResult_t nccl_result) { + return nccl_result != ncclSuccess; +} + +inline void throw_on_error(ncclResult_t stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + msg); #else - LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) - << string::Sprintf(args...); + LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) << msg; #endif - } } #endif // __APPLE__ and windows #endif // PADDLE_WITH_CUDA -template -inline void throw_on_error(T e) { - throw_on_error(e, ""); -} - -#define PADDLE_THROW(...) \ - throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) - -#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; - -#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ - ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); - -#ifdef _WIN32 -#define __PADDLE_THROW_ON_ERROR(COND, ...) \ - __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__) -#else // _WIN32 -#define __PADDLE_THROW_ON_ERROR(COND, ...) 
\ - __PADDLE_THROW_ERROR_I( \ - __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) -#endif // _WIN32 - -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto __cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ - } \ +#define PADDLE_THROW(...) \ + throw ::paddle::platform::EnforceNotMet( \ + ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__) + +#define PADDLE_ENFORCE(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, ::paddle::string::Sprintf(__VA_ARGS__)); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ + } \ } while (0) -#ifndef REPLACE_ENFORCE_GLOG -#define __PADDLE_ENFORCE_I(COND, ...) \ - do { \ - try { \ - __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ - } catch (...) { \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - } \ - } while (0) - -#else -#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); -#endif // REPLACE_ENFORCE_GLOG - -#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args -#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__)) - #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index ca89d91aadb2d3e9005e6dd06cef124428d7e250..400a6d7bfa5912774c4bbb2a5868dd9a471afd00 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #include +#include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" @@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "", namespace paddle { namespace platform { -int GetCUDADeviceCount() { +static int GetCUDADeviceCountImpl() { + const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. 
No GPU detected."; + return 0; + } + } + int count; PADDLE_ENFORCE( cudaGetDeviceCount(&count), @@ -66,6 +79,11 @@ int GetCUDADeviceCount() { return count; } +int GetCUDADeviceCount() { + static auto dev_cnt = GetCUDADeviceCountImpl(); + return dev_cnt; +} + int GetCUDAComputeCapability(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); cudaDeviceProp device_prop; @@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() { void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), - "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " + "(%p -> %p, length: %d)", + src, dst, static_cast(count)); } void GpuMemcpySync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), - "cudaMemcpy failed in paddle::platform::GpuMemcpySync"); + "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> " + "%p, length: %d)", + src, dst, static_cast(count)); } void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index faac6a12c66378d090b642312df4538aeeb3d8cd..269280d604a13a62046fb7811d34b7c69b61b50f 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -365,7 +365,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mem_fmt.ndims = axis.size(); for (unsigned int i = 0; i < nchw_tz.size(); ++i) { mem_fmt.dims[i] = nchw_tz[i]; // logical dimensions (nchw format, - // regardless physical layout) + // regardless physical layout) } mem_fmt.data_type = mkldnn_f32; mem_fmt.format = mkldnn_blocked; @@ -374,7 +374,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { for (int i = nchw_tz.size() - 1; i >= 0; --i) { mem_fmt.layout_desc.blocking.padding_dims[i] = nchw_tz[i]; // logical dimensions (nchw format, regardless physical - // layout) + // layout) mem_fmt.layout_desc.blocking.block_dims[i] = 1; mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride; diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 8df8e32098697540f02d488c873f5ae7fb29828e..6ae21ee8294bedc388f837aad3e20a2b9aca98a2 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -64,7 +64,7 @@ class NCCLGroupGuard { } inline ~NCCLGroupGuard() { - CHECK_EQ(dynload::ncclGroupEnd(), ncclSuccess); + PADDLE_ENFORCE(dynload::ncclGroupEnd()); NCCLMutex().unlock(); } }; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index dbc7843caa0c0a39a32cda6050fa99a3ab4c3e22..31c3bfa43ffec22059a602e9ff09a33188d72c91 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -15,18 +15,38 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace pybind { // Bind Methods -void BindTracer(pybind11::module *m) { +void BindTracer(pybind11::module* m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { + [](imperative::Tracer& self, framework::BlockDesc* root_block) { new (&self) imperative::Tracer(root_block); }) - .def("trace", &imperative::Tracer::Trace) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CPUPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CUDAPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) .def("py_trace", &imperative::Tracer::PyTrace, pybind11::return_value_policy::take_ownership); } diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 26247026667158a2f43cdac21bf5600479455e16..39e47be606c07ed216c9fe2ff8fa75552b8b7c76 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -33,7 +33,6 @@ using paddle::PaddlePredictor; using paddle::NativeConfig; using paddle::NativePaddlePredictor; using paddle::AnalysisPredictor; -using paddle::contrib::AnalysisConfig; static void BindPaddleDType(py::module *m); static void BindPaddleBuf(py::module *m); @@ -180,8 +179,14 @@ void BindNativePredictor(py::module *m) { } void BindAnalysisConfig(py::module *m) { - py::class_(*m, "AnalysisConfig") - .def(py::init()) + py::class_ analysis_config(*m, "AnalysisConfig"); + + py::enum_(analysis_config, "Precision") + .value("Float32", AnalysisConfig::Precision::kFloat32) + .value("Int8", AnalysisConfig::Precision::kInt8) + .export_values(); + + analysis_config.def(py::init()) .def(py::init()) .def(py::init()) .def("set_model", (void (AnalysisConfig::*)(const std::string &)) & @@ -215,7 +220,8 @@ void BindAnalysisConfig(py::module *m) { .def("specify_input_name", &AnalysisConfig::specify_input_name) .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, - py::arg("min_subgraph_size") = 3) + py::arg("min_subgraph_size") = 3, + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e63a3b687153b98f3a7735c6e3d9be5136c278c6..adbe1dcf4db95ff058ee850d46fdec74ce9f448e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) { .def("_grad_ivar", [](const imperative::VarBase &self) { return self.grads_; }, py::return_value_policy::reference) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CPUPlace &place, + bool blocking) { + std::unique_ptr 
new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CUDAPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) .def("value", [](const imperative::VarBase &self) { return self.var_; }, py::return_value_policy::reference) .def_property( @@ -469,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference); py::class_(m, "Reader", "") + .def("start", &framework::ReaderHolder::Start) .def("reset", &framework::ReaderHolder::ResetAll); using LoDTensorBlockingQueue = @@ -489,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle. .def("is_closed", &LoDTensorBlockingQueue::IsClosed); m.def("init_lod_tensor_blocking_queue", - [](Variable &var, size_t capacity, - const std::vector> &shapes) - -> std::shared_ptr { - std::vector dims(shapes.size()); - std::transform(shapes.begin(), shapes.end(), dims.begin(), - [](const std::vector &shape) { - return make_ddim(shape); - }); - auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims, - FLAGS_reader_queue_speed_test_mode); - return holder->GetQueue(); - }, + [](Variable &var, + size_t capacity) -> std::shared_ptr { + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return holder->GetQueue(); + }, py::return_value_policy::copy); py::class_(m, "_Scope", R"DOC( @@ -626,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") - .def(py::init()) + .def("__init__", + [](platform::CUDAPlace &self, int dev_id) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE( + dev_id >= 0 && dev_id < platform::GetCUDADeviceCount(), + "Invalid CUDAPlace(%d), must inside [0, %d)", dev_id, + platform::GetCUDADeviceCount()); + new (&self) platform::CUDAPlace(dev_id); +#else + PADDLE_THROW("Cannot use CUDAPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "CPUPlace") @@ -634,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); py::class_(m, "CUDAPinnedPlace") - .def(py::init<>()) + .def("__init__", + [](platform::CUDAPinnedPlace &) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "Place") @@ -1002,7 +1028,7 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.remove_unnecessary_lock_ = b; }, - R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") + R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True.)DOC") .def_property( "num_trainers", [](const BuildStrategy &self) { return self.num_trainers_; }, diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 0b94b60018aac3a61edfda4d7ecb762e9fe70673..16bb3771f2e9bcc07028ef2039fed8691f9aab97 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -84,6 +84,8 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... 
args) { tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...)); } +inline std::string Sprintf() { return ""; } + template std::string Sprintf(const Args&... args) { std::ostringstream oss; diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index cda04451f5e6a9a27c9352626f3f08e886fca04b..1135caf4f8c32901d93270d372fdaac702acf006 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -164,16 +164,18 @@ function cmake_gen() { INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} fi + distibuted_flag=${WITH_DISTRIBUTE:-OFF} + grpc_flag=${WITH_GRPC:-${distibuted_flag}} + cat <= 0: + if ops_type[search_end_index] == "relu": + return Calibrator.u8_max + + input_name = input_index_name[search_end_index][0] + + for i in output_index_name.keys(): + if input_name in output_index_name[i]: + search_end_index = i + break + + if ops_type[ + search_end_index] not in Calibrator.const_sign_op_type and ops_type[ + search_end_index] != 'conv2d': + return Calibrator.s8_max + + if ops_type[search_end_index] != 'conv2d': + continue + + if program.current_block().ops[search_end_index].has_attr( + 'fuse_relu') and program.current_block().ops[ + search_end_index].attr('fuse_relu'): + return Calibrator.u8_max + else: + return Calibrator.s8_max + + return Calibrator.s8_max + + def __check_op_type_with_specified_var_as_input(self, + program, + var_name, + start_index=0): + ''' + Check whether all the type of ops that use the specified variable as the + input.If one of those op is not int8-enabled, return False. + ''' + op_type_list = [ + op.type for op in program.current_block().ops[start_index:] + if var_name in op.input_arg_names + ] + for i in op_type_list: + if not i in Calibrator.supported_int8_op_type: + return False + return True + + def __check_var_source_dt(self, var_name): + ''' + Check whether the specified variable is the output of int8 conv op or not. + If true, return the original op index. + If false, return -1 + ''' + return self._int8_output_var_op_index_dict[ + var_name] if var_name in self._int8_output_var_op_index_dict else -1 + + def __update_int8_output_var_op_index_dict(self, index, var_name=None): + ''' + Update the int8_output_variable/op_index dictionary + ''' + for k, v in self._int8_output_var_op_index_dict.items(): + if v >= index: + self._int8_output_var_op_index_dict[k] = v + 1 + if var_name: + self._int8_output_var_op_index_dict[var_name] = index + + def __update_program(self): + ''' + Update the program with the quantize/dequantize op insertion. 
+ ''' + quantize_index, dequantize_index = self.__get_quantize_dequantize_combination( + self._output_program) + inserted_op_length = 0 + calc_max_func = self.__get_optimal_scaling_factor if self.algo == "KL" else np.max + insert_op_collection = sorted(quantize_index + dequantize_index) + + for index in insert_op_collection: + if index in quantize_index: + quantize_tmp = self._output_program.current_block().create_var( + name="quantize_{}_tmp".format(index), + dtype=core.VarDesc.VarType.UINT8) + original_out_name = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output_names[0] + original_out = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output(original_out_name)[0] + + op = self._output_program.current_block()._insert_op( + index=index + inserted_op_length, + type="quantize", + inputs={"Input": original_out}, + outputs={"Output": quantize_tmp}, ) + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr( + "Scale", self._var_max_range[original_out] / + calc_max_func(self._var_max_value_map[original_out])) + + if self.__get_max_range_by_var_name( + self._output_program, + original_out) == Calibrator.s8_max: + op._set_attr("is_negative_input", 1) + + self.__update_int8_output_var_op_index_dict( + index + inserted_op_length, "quantize_{}_tmp".format(index)) + + inserted_op_length += 1 + for op in self._output_program.current_block().ops[ + index + inserted_op_length:]: + for j in op.input_names: + if op.input(j) and op.input( + j + )[0] == original_out and op.type in Calibrator.supported_int8_op_type: + op.desc.set_input(j, + ["{}".format(quantize_tmp.name)]) + else: + start_index = index + inserted_op_length + dequantize_tmp_var = self._output_program.current_block( + ).create_var( + name="dequantize_{}_tmp".format(index + 1), + dtype="float32", ) + original_out_var = None + + for original_input in self._output_program.current_block().ops[ + start_index].input_arg_names: + index_res = self.__get_op_index_by_output_var( + self._output_program, original_input) + if index_res != -1: + original_out_var = original_input + break + + if original_out_var: + op = self._output_program.current_block()._insert_op( + index=start_index, + type="dequantize", + inputs={"Input": original_out_var}, + outputs={"Output": dequantize_tmp_var}) + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr("Scale", self._var_max_range[original_out_var] + / calc_max_func(self._var_max_value_map[ + original_out_var])) + + for op_index in range( + start_index + 1, + len(self._output_program.current_block().ops)): + if self._output_program.current_block( + ).ops[op_index].type == "conv2d" and self._output_program.current_block( + ).ops[op_index].attr("force_fp32_output"): + continue + else: + for j in self._output_program.current_block().ops[ + op_index].input_names: + if len(self._output_program.current_block().ops[ + op_index].input(j) + ) and self._output_program.current_block( + ).ops[op_index].input(j)[ + 0] == original_out_var: + self._output_program.current_block( + ).ops[op_index].desc.set_input( + j, + ["{}".format(dequantize_tmp_var.name)]) + + inserted_op_length += 1 + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + + def __update_output_program_attr(self): + for i in self._output_program.list_vars(): + if i.name in self._persistable_vars: + i.persistable = False + os.system("rm -rf {}/{}".format(self.pretrained_model, i.name)) + + for i in 
self._u8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.UINT8) + + for i in self._s8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.INT8) + + @property + def sampling_program(self): + return self._output_program + + @property + def sampling_vars(self): + return self._weights_var_name + self._conv_input_var_name + self._conv_output_var_name + self._residual_input_var_name + self._pool2d_output_var_name + + def _is_close(self, a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + + def __generate_output_program(self): + for i in self.program.list_vars(): + if not i.persistable and i.name in self.sampling_vars: + i.persistable = True + self._persistable_vars.append(i.name) + + self._output_program = self.program.clone() + + def __save_scale(self): + ''' + Update the convolution scale information. + ''' + func = self.__get_optimal_scaling_factor if self.algo == 'KL' else np.max + for i in self._conv_op_index[1:]: + weights_var_name = self.program.current_block().ops[i].input( + 'Filter')[0] + input_var_name = self.program.current_block().ops[i].input('Input')[ + 0] + output_var_name = self.program.current_block().ops[i].output( + 'Output')[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_weights", self._weights_scaling_factor[weights_var_name]) + + self._output_program.current_block().ops[i]._set_attr( + "Scale_in", self._var_max_range[input_var_name] / + func(self._var_max_value_map[input_var_name])) + self._output_program.current_block().ops[i]._set_attr( + "Scale_out", self._var_max_range[output_var_name] / + func(self._var_max_value_map[output_var_name])) + if self._output_program.current_block().ops[i].desc.input( + "ResidualData"): + residual_var_name = self._output_program.current_block().ops[ + i].desc.input("ResidualData")[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_in_eltwise", self._var_max_range[residual_var_name] / + func(self._var_max_value_map[residual_var_name])) + + def __sampling(self, sampling_data): + ''' + Sampling the variables data range. 
+ ''' + for i in self.program.list_vars(): + if i.name not in self.sampling_vars: + continue + + if i.name in self._weights_var_name: + scaling_factor_per_channel = [] + data = sampling_data[i.name][0] + for j in range(data.shape[0]): + var_value = float(np.max(np.abs(data[j]))) + if not self._is_close(var_value, 0.0): + scaling_factor_per_channel.append(Calibrator.s8_max / + var_value) + else: + scaling_factor_per_channel.append(0.0) + self._weights_scaling_factor[ + i.name] = scaling_factor_per_channel + else: + if i.name in self._conv_output_var_name: + op_pos = self.__get_op_index_by_output_var(self.program, + i.name) + cur_op = self.program.current_block().ops[op_pos] + + if cur_op.has_attr('fuse_relu') and cur_op.attr( + 'fuse_relu'): + max_range = Calibrator.u8_max + self._u8_output_var.append(i.name) + else: + max_range = Calibrator.s8_max + self._s8_output_var.append(i.name) + else: + max_range = self.__get_max_range_by_var_name(self.program, + i.name) + max_value = [[np.abs(np_data)] + for np_data in sampling_data[i.name]] + + self._var_max_range[i.name] = max_range + self._var_max_value_map[i.name] = max_value + + def __check_force_fp32_attr_by_output_var(self, program, var_name): + for op in program.current_block().ops: + if op.type == "conv2d" and var_name in op.output_arg_names: + return op.attr("force_fp32_output") + return False + + def __get_op_index_by_output_var(self, program, var_name, start_index=0): + ''' + Check whether the specified input variable is the output of the + conv/pool2d op's output or not. + + Returns: + The index if the variable is the output of any conv/pool2d op's + output. + -1 when the variable is not the output of any conv/pool2d op's + output. + ''' + for index, op in enumerate(program.current_block().ops[start_index:]): + if var_name in op.output_arg_names and op.type in Calibrator.supported_int8_op_type: + return index + return -1 + + def __get_op_index_by_input_var(self, program, var_name, start_index=0): + ''' + Get the op index by specified input variable. + Returns: + The op index if the variable is the input of this op or -1 if the + variable is not the input of any op. + ''' + for index, op in enumerate(program.current_block().ops[start_index:]): + if var_name in op.input_arg_names: + return index + + return -1 + + def __get_quantize_dequantize_combination(self, program): + """ + Get the quantize/dequantize op index for further inserting. + Args: + The program desc. + Returns: + Two lists contains the quantize op and dequantize op index information. + """ + quantize_op_index = [] + dequantize_op_index = [] + minimal_conv_count = 2 # there must be two conv ops if not enable the first conv int8. 
+ if len(self._conv_op_index) < minimal_conv_count: + return [], [] + + for index, value in enumerate(self._conv_op_index): + if index == 0: + quantize_op_index.append(self._conv_op_index[index + 1]) + elif index == len(self._conv_op_index) - 1: + output_var = program.current_block().ops[value].output( + "Output")[0] + if self.__check_op_type_with_specified_var_as_input( + program, output_var, index): + dequantize_op_index.append(self._conv_op_index[index] + 2) + else: + program.current_block().ops[value]._set_attr( + "force_fp32_output", True) + + elif self._conv_op_index[index] + 1 < self._conv_op_index[index + + 1]: + + program.current_block().ops[self._conv_op_index[ + index]]._set_attr("force_fp32_output", True) + + for op_index in range(self._conv_op_index[index + 1], + self._conv_op_index[index], -1): + op_type = program.current_block().ops[op_index].type + op_has_int8_input = False + input_var_name = None + input_length = len(program.current_block().ops[op_index] + .input_arg_names) + + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if self.__check_var_source_dt(var_name) != -1: + op_has_int8_input = True + input_var_name = var_name + break + + if op_has_int8_input: + if op_type == "conv2d": + if program.current_block().ops[op_index + + 1].type == "conv2d": + continue + elif program.current_block( + ).ops[op_index + + 1].type in Calibrator.non_conv_int8_op_type: + dequantize_op_index.append(op_index + 2) + break + else: + program.current_block().ops[op_index]._set_attr( + "force_fp32_output", True) + continue + elif not self.__check_force_fp32_attr_by_output_var( + program, input_var_name + ) and op_index not in dequantize_op_index: + share_input_flag = True + for input_attr_name in program.current_block().ops[ + op_index].input_names: + input_var_name = program.current_block().ops[ + op_index].input(input_attr_name)[0] + cousin_op_index = self.__get_op_index_by_input_var( + program, input_var_name) + if cousin_op_index != -1 and cousin_op_index in dequantize_op_index: + share_input_flag = False + break + if share_input_flag: + dequantize_op_index.append(op_index) + + elif input_length: + output_is_to_int8_op = False + share_input_flag = True + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if not self.__check_op_type_with_specified_var_as_input( + program, var_name): + share_input_flag = False + break + + for var_name in program.current_block().ops[ + op_index].output_arg_names: + if self.__get_op_index_by_output_var( + program, var_name, op_index) != -1: + output_is_to_int8_op = True + break + + if share_input_flag or output_is_to_int8_op: + quantize_op_index.append(op_index) + + return quantize_op_index, dequantize_op_index + + def __init_analysis(self): + ''' + Collect the variable names for sampling. + ''' + start_index = 1 #analysis the conv op detail from second conv op. 
+ + for i in self._conv_op_index[start_index:]: + self._weights_var_name.append(self.program.current_block().ops[i] + .input('Filter')[0]) + self._conv_input_var_name.append(self.program.current_block().ops[i] + .input('Input')[0]) + self._conv_output_var_name.append(self.program.current_block().ops[ + i].output('Output')[0]) + self._int8_output_var_op_index_dict[self.program.current_block() + .ops[i].output('Output')[0]] = i + if self.program.current_block().ops[i].desc.input("ResidualData"): + self._residual_input_var_name.append(self.program.current_block( + ).ops[i].desc.input("ResidualData")[0]) + + if self.program.current_block().ops[i + 1].type == "pool2d": + self._pool2d_output_var_name.append(self.program.current_block( + ).ops[i + 1].output('Out')[0]) + + def __expand_quantized_bins(self, quantized_bins, reference_bins): + expanded_quantized_bins = [0] * len(reference_bins) + num_merged_bins = len(reference_bins) / len(quantized_bins) + j_start = 0 + j_end = num_merged_bins + for idx in xrange(len(quantized_bins)): + zero_count = reference_bins[j_start:j_end].count(0) + num_merged_bins = j_end - j_start + if zero_count == num_merged_bins: + avg_bin_ele = 0 + else: + avg_bin_ele = quantized_bins[idx] / ( + num_merged_bins - zero_count + 0.0) + for idx1 in xrange(j_start, j_end): + expanded_quantized_bins[idx1] = (0 if reference_bins[idx1] == 0 + else avg_bin_ele) + j_start += num_merged_bins + j_end += num_merged_bins + if (idx + 1) == len(quantized_bins) - 1: + j_end = len(reference_bins) + return expanded_quantized_bins + + def __safe_entropy(self, reference_distr_P, P_sum, candidate_distr_Q, + Q_sum): + ''' + Calculate the entropy. + ''' + assert len(reference_distr_P) == len(candidate_distr_Q) + tmp_sum1 = 0 + tmp_sum2 = 0 + for idx in range(len(reference_distr_P)): + p_idx = reference_distr_P[idx] + q_idx = candidate_distr_Q[idx] + if p_idx == 0: + tmp_sum1 += 0 + tmp_sum2 += 0 + else: + if q_idx == 0: + print("Fatal error!, idx = " + str(idx) + + " qindex = 0! p_idx = " + str(p_idx)) + tmp_sum1 += p_idx * (math.log(Q_sum * p_idx)) + tmp_sum2 += p_idx * (math.log(P_sum * q_idx)) + return (tmp_sum1 - tmp_sum2) / P_sum + + # Reference: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + def __get_optimal_scaling_factor(self, + activation_blob, + num_quantized_bins=255): + ''' + Using the KL-divergenc method to get the more precise scaling factor. 
+ ''' + max_val = np.max(activation_blob) + min_val = np.min(activation_blob) + if min_val >= 0: + hist, hist_edeges = np.histogram( + activation_blob, bins=2048, range=(min_val, max_val)) + ending_iter = 2047 + starting_iter = int(ending_iter * 0.7) + else: + th = max(abs(max_val), abs(min_val)) + hist, hist_edeges = np.histogram( + activation_blob, bins=2048, range=(-th, th)) + starting_iter = 0 + ending_iter = 2047 + if abs(max_val) > abs(min_val): + while starting_iter < ending_iter: + if hist[starting_iter] == 0: + starting_iter += 1 + continue + else: + break + starting_iter += int((ending_iter - starting_iter) * 0.6) + else: + while ending_iter > 0: + if hist[ending_iter] == 0: + ending_iter -= 1 + continue + else: + break + starting_iter = int(0.6 * ending_iter) + bin_width = hist_edeges[1] - hist_edeges[0] + P_sum = len(activation_blob) + min_kl_divergence = 0 + min_kl_index = 0 + kl_inited = False + for i in range(starting_iter, ending_iter + 1): + reference_distr_P = hist[0:i].tolist() + outliers_count = sum(hist[i:2048]) + if reference_distr_P[i - 1] == 0: + continue + reference_distr_P[i - 1] += outliers_count + reference_distr_bins = reference_distr_P[:] + candidate_distr_Q = hist[0:i].tolist() + num_merged_bins = i / num_quantized_bins + candidate_distr_Q_quantized = [0] * num_quantized_bins + j_start = 0 + j_end = num_merged_bins + for idx in xrange(num_quantized_bins): + candidate_distr_Q_quantized[idx] = sum(candidate_distr_Q[ + j_start:j_end]) + j_start += num_merged_bins + j_end += num_merged_bins + if (idx + 1) == num_quantized_bins - 1: + j_end = i + candidate_distr_Q = self.__expand_quantized_bins( + candidate_distr_Q_quantized, reference_distr_bins) + Q_sum = sum(candidate_distr_Q) + kl_divergence = self.__safe_entropy(reference_distr_P, P_sum, + candidate_distr_Q, Q_sum) + if not kl_inited: + min_kl_divergence = kl_divergence + min_kl_index = i + kl_inited = True + elif kl_divergence < min_kl_divergence: + min_kl_divergence = kl_divergence + min_kl_index = i + else: + pass + if min_kl_index == 0: + while starting_iter > 0: + if hist[starting_iter] == 0: + starting_iter -= 1 + continue + else: + break + min_kl_index = starting_iter + return (min_kl_index + 0.5) * bin_width + + @staticmethod + def __dot(program, output_name="model.dot"): + ''' + Generate the graphiz dot file for debugging. 
+ ''' + dot_graph = "" + dot_nodes = [] + dot_edges = [] + dot_graph += "digraph pm {\n" + for block in program.blocks: + ops = list(block.ops) + for index, op in enumerate(ops): + op_type = op.type + op_name = op_type + "_" + op.output_arg_names[0].replace( + ".", "_") + "___" + str(index) + for name in op.input_arg_names: + name = name.replace(".", "_") + dot_edge = name + " -> " + op_name + if dot_edge not in dot_edges: + dot_edges.append(dot_edge) + dot_node = name + " [shape=oval, style=filled, fillcolor=yellow]" + if dot_node not in dot_nodes: + dot_nodes.append(dot_node) + + for name in op.output_arg_names: + name = name.replace(".", "_") + dot_edge = op_name + " -> " + name + if dot_edge not in dot_edges: + dot_edges.append(dot_edge) + if op_type in Calibrator.supported_int8_op_type: + if op_type == "conv2d" and op.has_attr( + 'force_fp32_output') and op.attr( + "force_fp32_output"): + dot_node = op_name + " [shape=box, style=filled, color=deeppink]" + else: + dot_node = op_name + " [shape=box, style=filled, color=greenyellow]" + elif op_type in ["quantize", "dequantize"]: + dot_node = op_name + " [shape=box, style=filled, color=gold]" + else: + dot_node = op_name + " [shape=box, style=filled, fillcolor=red]" + + if dot_node not in dot_nodes: + dot_nodes.append(dot_node) + + for dot_edge in dot_edges: + dot_graph += dot_edge + "\n" + for dot_node in dot_nodes: + dot_graph += dot_node + "\n" + dot_graph += "}" + + with open(output_name, 'w') as f: + f.write(dot_graph) diff --git a/python/paddle/fluid/contrib/reader/README.md b/python/paddle/fluid/contrib/reader/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e4b7d1ce3d9664495220d7ccfc6ef6eac0b81c2 --- /dev/null +++ b/python/paddle/fluid/contrib/reader/README.md @@ -0,0 +1,15 @@ +## CTR READER + +An multi-thread cpp reader that has the same interface with py_reader. It +uses cpp multi-thread to read file and is much more faster then the Python read +thread in py_reader. + +Currently, it support two types of file: + - gzip + - plain text file + +and two types of data format: + - cvs data format is : + * label dense_fea,dense_fea sparse_fea,sparse_fea + - the svm data format is : + * label slot1:fea_sign slot2:fea_sign slot1:fea_sign diff --git a/python/paddle/fluid/contrib/reader/__init__.py b/python/paddle/fluid/contrib/reader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4cf85ffc166420f117db9576b4d687c96d429e3c --- /dev/null +++ b/python/paddle/fluid/contrib/reader/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . 
import ctr_reader + +__all__ = ctr_reader.__all__ diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py index b8449e8d848670f8262aa01e5654e0e2fc621837..44e8647f8c3f52b0d3c52c7febfe2ef4ef878bd8 100644 --- a/python/paddle/fluid/contrib/reader/ctr_reader.py +++ b/python/paddle/fluid/contrib/reader/ctr_reader.py @@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \ default_startup_program, Variable from paddle.fluid.unique_name import generate as unique_name +__all__ = ['ctr_reader'] + def monkey_patch_reader_methods(reader): def __get_reader__(): @@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader): def reset(): return __get_reader__().reset() + def start(): + return __get_reader__().start() + reader.reset = reset + reader.start = start reader.stop_gradient = True reader.persistable = True return reader @@ -44,13 +50,18 @@ def _copy_reader_var_(block, var): return new_var -def ctr_reader(feed_data, - capacity, - thread_num, - batch_size, - file_list, - slots, - name=None): +def ctr_reader( + feed_dict, + file_type, # gzip or plain + file_format, # csv or svm + dense_slot_index, + sparse_slot_index, + capacity, + thread_num, + batch_size, + file_list, + slots, + name=None): """ Create a CTR reader for data feeding in Python @@ -67,12 +78,21 @@ def ctr_reader(feed_data, Note that :code:`Program.clone()` method cannot clone :code:`py_reader`. Args: + feed_dict(list(variable)): a list of data variable. + file_type('gzip'|'plain'): the type of the data file + file_format('csv'|'svm'): csv data or svm data format. + cvs data format is : + label dense_fea,dense_fea sparse_fea,sparse_fea + the svm data format is : + label slot1:fea_sign slot2:fea_sign slot1:fea_sign + dense_slot_index(list(int)): the index of dense slots + sparse_slot_index(list(int)): the index of sparse slots capacity(int): The buffer capacity maintained by :code:`py_reader`. - thread_num(list|tuple): List of tuples which declaring data shapes. - batch_size(list|tuple): List of strs which declaring data type. - file_list(list|tuple): List of ints which declaring data lod_level. - slots(bool): Whether use double buffer or not. - name(basestring): The prefix Python queue name and Reader name. None will + thread_num(int): the thread num to read files by cpp reader. + batch_size(int): batch size of data. + file_list(list(str)): List of file names that need to read. + slots(list(int64)): list of slot id. + name(string): The prefix Python queue name and Reader name. None will be generated automatically. Returns: @@ -80,7 +100,15 @@ def ctr_reader(feed_data, Examples: - 1. The basic usage of :code:`py_reader` is as follows: + 1. The basic usage of :code:`ctr_reader` is as follows: + + .. 
code-block:: python + + py_reader = fluid.contrib.ctr_reader.ctr_reader( + feed_dict=datas, file_type='plain', file_format='csv', + file_list=file_list, dense_slot_indexs=[1, 2, 3, 4], sparse_slot_indexs=[], + capacity=64, thread_num=20, batch_size=1000, slots=[], name='ctr_reader') + """ if name is None: queue_name = unique_name('lod_tensor_blocking_queue') @@ -90,7 +118,7 @@ def ctr_reader(feed_data, reader_name = "_".join([name, "reader"]) var = global_scope().var(queue_name) - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) startup_blk = default_startup_program().current_block() reader_var = startup_blk.create_var(name=reader_name) @@ -99,12 +127,22 @@ def ctr_reader(feed_data, inputs={'blocking_queue': [queue_name]}, outputs={'Out': [reader_var]}, attrs={ + 'use_data_config': False, 'thread_num': thread_num, 'batch_size': batch_size, 'file_list': file_list, - 'slots': slots, + 'file_type': file_type, + 'file_format': file_format, + 'dense_slot_index': dense_slot_index, + 'sparse_slot_index': sparse_slot_index, + 'sparse_slots': slots, + 'ranks': [], + 'lod_levels': [], + 'shape_concat': [] }) + dtypes = [data.dtype for data in feed_dict] + reader_var.desc.set_dtypes(dtypes) reader_var.persistable = True main_prog_reader_var = _copy_reader_var_( @@ -118,6 +156,9 @@ def ctr_reader(feed_data, main_blk = default_main_program().current_block() main_blk.append_op( - type='read', inputs={'Reader': [reader]}, outputs={'Out': feed_data}) + type='read', + inputs={'Reader': [reader]}, + attrs={'infer_out': False}, + outputs={'Out': feed_dict}) return reader diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt index 79bec8c4ad34d682895250bc29b1fddb3a569bd4..81aee1233d1db756686d1a934b94672dc5c770fe 100644 --- a/python/paddle/fluid/contrib/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -1,6 +1,10 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +if(APPLE OR WIN32 OR NOT WITH_MKL) + list(REMOVE_ITEM TEST_OPS test_calibration) +endif() + foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py new file mode 100644 index 0000000000000000000000000000000000000000..424ea245a0f2dff0d437ace386f2e4e0fa6b517d --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_calibration.py @@ -0,0 +1,324 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
+import unittest +import os +import numpy as np +import time +import sys +import random +import paddle +import paddle.fluid as fluid +import functools +import contextlib +from paddle.dataset.common import download +from PIL import Image, ImageEnhance +import math +import paddle.fluid.contrib.int8_inference.utility as int8_utility + +random.seed(0) +np.random.seed(0) + +DATA_DIM = 224 + +THREAD = 1 +BUF_SIZE = 102400 + +DATA_DIR = 'data/ILSVRC2012' + +img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + + +# TODO(guomingz): Remove duplicated code from resize_short, crop_image, process_image, _reader_creator +def resize_short(img, target_size): + percent = float(target_size) / min(img.size[0], img.size[1]) + resized_width = int(round(img.size[0] * percent)) + resized_height = int(round(img.size[1] * percent)) + img = img.resize((resized_width, resized_height), Image.LANCZOS) + return img + + +def crop_image(img, target_size, center): + width, height = img.size + size = target_size + if center == True: + w_start = (width - size) / 2 + h_start = (height - size) / 2 + else: + w_start = np.random.randint(0, width - size + 1) + h_start = np.random.randint(0, height - size + 1) + w_end = w_start + size + h_end = h_start + size + img = img.crop((w_start, h_start, w_end, h_end)) + return img + + +def process_image(sample, mode, color_jitter, rotate): + img_path = sample[0] + + img = Image.open(img_path) + + img = resize_short(img, target_size=256) + img = crop_image(img, target_size=DATA_DIM, center=True) + + if img.mode != 'RGB': + img = img.convert('RGB') + + img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255 + img -= img_mean + img /= img_std + + return img, sample[1] + + +def _reader_creator(file_list, + mode, + shuffle=False, + color_jitter=False, + rotate=False, + data_dir=DATA_DIR): + def reader(): + with open(file_list) as flist: + full_lines = [line.strip() for line in flist] + if shuffle: + np.random.shuffle(full_lines) + + lines = full_lines + + for line in lines: + img_path, label = line.split() + img_path = os.path.join(data_dir, img_path) + if not os.path.exists(img_path): + continue + yield img_path, int(label) + + mapper = functools.partial( + process_image, mode=mode, color_jitter=color_jitter, rotate=rotate) + + return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE) + + +def val(data_dir=DATA_DIR): + file_list = os.path.join(data_dir, 'val_list.txt') + return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir) + + +class TestCalibrationForResnet50(unittest.TestCase): + def setUp(self): + self.int8_download = 'int8/download' + self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + + self.int8_download) + + data_urls = [] + data_md5s = [] + self.data_cache_folder = '' + if os.environ.get('DATASET') == 'full': + data_urls.append( + 'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa' + ) + data_md5s.append('60f6525b0e1d127f345641d75d41f0a8') + data_urls.append( + 'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab' + ) + data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5') + self.data_cache_folder = self.download_data(data_urls, data_md5s, + "full_data", False) + else: + data_urls.append( + 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz' + ) + data_md5s.append('1b6c1c434172cca1bf9ba1e4d7a3157d') + self.data_cache_folder = self.download_data(data_urls, data_md5s, + 
"small_data", False) + + # reader/decorator.py requires the relative path to the data folder + cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data", + self.data_cache_folder) + os.system(cmd) + + self.batch_size = 1 + self.sample_iterations = 50 + self.infer_iterations = 50000 if os.environ.get( + 'DATASET') == 'full' else 50 + + def cache_unzipping(self, target_folder, zip_path): + if not os.path.exists(target_folder): + cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, + zip_path) + os.system(cmd) + + def download_data(self, data_urls, data_md5s, folder_name, is_model=True): + data_cache_folder = os.path.join(self.cache_folder, folder_name) + zip_path = '' + if os.environ.get('DATASET') == 'full': + file_names = [] + for i in range(0, len(data_urls)): + download(data_urls[i], self.int8_download, data_md5s[i]) + file_names.append(data_urls[i].split('/')[-1]) + + zip_path = os.path.join(self.cache_folder, + 'full_imagenet_val.tar.gz') + if not os.path.exists(zip_path): + cat_command = 'cat' + for file_name in file_names: + cat_command += ' ' + os.path.join(self.cache_folder, + file_name) + cat_command += ' > ' + zip_path + os.system(cat_command) + + if os.environ.get('DATASET') != 'full' or is_model: + download(data_urls[0], self.int8_download, data_md5s[0]) + file_name = data_urls[0].split('/')[-1] + zip_path = os.path.join(self.cache_folder, file_name) + + print('Data is downloaded at {0}').format(zip_path) + self.cache_unzipping(data_cache_folder, zip_path) + return data_cache_folder + + def download_model(self): + # resnet50 fp32 data + data_urls = [ + 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz' + ] + data_md5s = ['4a5194524823d9b76da6e738e1367881'] + self.model_cache_folder = self.download_data(data_urls, data_md5s, + "resnet50_fp32") + self.model = "ResNet-50" + self.algo = "direct" + + def run_program(self, model_path, generate_int8=False, algo='direct'): + image_shape = [3, 224, 224] + os.environ['FLAGS_use_mkldnn'] = 'True' + + fluid.memory_optimize(fluid.default_main_program()) + + exe = fluid.Executor(fluid.CPUPlace()) + + [infer_program, feed_dict, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + + t = fluid.transpiler.InferenceTranspiler() + t.transpile(infer_program, fluid.CPUPlace()) + + val_reader = paddle.batch(val(), self.batch_size) + iterations = self.infer_iterations + + if generate_int8: + int8_model = os.path.join(os.getcwd(), "calibration_out") + iterations = self.sample_iterations + + if os.path.exists(int8_model): + os.system("rm -rf " + int8_model) + os.system("mkdir " + int8_model) + + calibrator = int8_utility.Calibrator( + program=infer_program, + pretrained_model=model_path, + algo=algo, + exe=exe, + output=int8_model, + feed_var_names=feed_dict, + fetch_list=fetch_targets) + + test_info = [] + cnt = 0 + periods = [] + for batch_id, data in enumerate(val_reader()): + image = np.array( + [x[0].reshape(image_shape) for x in data]).astype("float32") + label = np.array([x[1] for x in data]).astype("int64") + label = label.reshape([-1, 1]) + running_program = calibrator.sampling_program.clone( + ) if generate_int8 else infer_program.clone() + for op in running_program.current_block().ops: + if op.has_attr("use_mkldnn"): + op._set_attr("use_mkldnn", True) + + t1 = time.time() + _, acc1, _ = exe.run( + running_program, + feed={feed_dict[0]: image, + feed_dict[1]: label}, + fetch_list=fetch_targets) + t2 = time.time() + period = t2 - t1 + periods.append(period) + + if generate_int8: + 
calibrator.sample_data() + + test_info.append(np.mean(acc1) * len(data)) + cnt += len(data) + + if (batch_id + 1) % 100 == 0: + print("{0} images,".format(batch_id + 1)) + sys.stdout.flush() + + if (batch_id + 1) == iterations: + break + + if generate_int8: + calibrator.save_int8_model() + + print( + "Calibration is done and the corresponding files are generated at {}". + format(os.path.abspath("calibration_out"))) + else: + throughput = cnt / np.sum(periods) + latency = np.average(periods) + acc1 = np.sum(test_info) / cnt + return (throughput, latency, acc1) + + def test_calibration(self): + self.download_model() + print("Start FP32 inference for {0} on {1} images ...").format( + self.model, self.infer_iterations) + (fp32_throughput, fp32_latency, + fp32_acc1) = self.run_program(self.model_cache_folder + "/model") + print("Start INT8 calibration for {0} on {1} images ...").format( + self.model, self.sample_iterations) + self.run_program( + self.model_cache_folder + "/model", True, algo=self.algo) + print("Start INT8 inference for {0} on {1} images ...").format( + self.model, self.infer_iterations) + (int8_throughput, int8_latency, + int8_acc1) = self.run_program("calibration_out") + delta_value = np.abs(fp32_acc1 - int8_acc1) + self.assertLess(delta_value, 0.01) + print( + "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}". + format(self.model, self.batch_size, fp32_throughput, fp32_latency, + fp32_acc1)) + print( + "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}". + format(self.model, self.batch_size, int8_throughput, int8_latency, + int8_acc1)) + sys.stdout.flush() + + +class TestCalibrationForMobilenetv1(TestCalibrationForResnet50): + def download_model(self): + # mobilenetv1 fp32 data + data_urls = [ + 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + ] + data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] + self.model_cache_folder = self.download_data(data_urls, data_md5s, + "mobilenetv1_fp32") + self.model = "MobileNet-V1" + self.algo = "KL" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 7b70d19de5ca309441bdc1404e6e601af3c5b892..a24e1d13003d5c42fa9b5a9346d81c8de4ba45c4 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -88,8 +88,8 @@ class DataToLoDTensorConverter(object): raise ValueError( "Reshape error. 
What is defined in data layer is {}, but receive {}" .format(self.shape, arr.shape)) - else: - self._check_shape(arr.shape) + #else: + # self._check_shape(arr.shape) t = core.LoDTensor() t.set(arr, self.place) if self.lod_level > 0: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 1a0a69b5c448cbaee751e0df1a5cfe21c873e00d..4ca2c544e41159cc3b8da91c7a83c91d67bac068 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -71,6 +71,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() _imperative_tracer_ = None +_imperative_current_expected_place_ = None def _in_imperative_mode(): @@ -81,6 +82,10 @@ def _imperative_tracer(): return _imperative_tracer_ +def _current_expected_place(): + return _imperative_current_expected_place_ + + class NameScope(object): def __init__(self, name="", parent=None): self._children = dict() @@ -384,8 +389,8 @@ class Variable(object): self._ivar.stop_gradient = stop_gradient def _numpy(self): - tensor = self._ivar.value().get_tensor() - return np.array(tensor) + new_ivar = self._ivar._copy_to(core.CPUPlace(), True) + return np.array(new_ivar.value().get_tensor()) def _backward(self): self._ivar._run_backward() @@ -441,11 +446,16 @@ class Variable(object): @property def _stop_gradient(self): - return self._ivar.stop_gradient + if _in_imperative_mode(): + return self._ivar.stop_gradient + else: + return self.stop_gradient @_stop_gradient.setter def _stop_gradient(self, s): - self._ivar.stop_gradient = s + if _in_imperative_mode(): + self._ivar.stop_gradient = s + self.stop_gradient = s @property def persistable(self): @@ -1306,12 +1316,16 @@ class Block(object): outputs=kwargs.get("outputs", None), attrs=kwargs.get("attrs", None)) self.ops.append(op) + + # TODO(minqiyang): add stop_gradient support in static mode too. + # currently, we only support stop_gradient in imperative mode. self._trace_op(op, kwargs.get("stop_gradient", False)) return op def _trace_op(self, op, stop_gradient=False): if _in_imperative_mode(): _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc, + _imperative_current_expected_place_, stop_gradient) def _insert_op(self, index, *args, **kwargs): @@ -1893,12 +1907,20 @@ class Program(object): self._current_role = core.op_proto_and_checker_maker.OpRole.Forward self._op_role_var = [] - # for distribute + # for distribute training + # _is_distributed = True if under distributed training self._is_distributed = False + # _is_chief = True if the trainer is the first one, usually No.0 self._is_chief = False - self._slice_vars_and_attrs = [] + # _parameters_on_pservers records all the parameters distributed on parameter servers. + self._parameters_on_pservers = None + # _endpoints is a list about parameter servers ip:port, such as ["ip:port","ip:port"] self._endpoints = [] + # if current role is parameter server, the _ps_endpoint is its "ip:port" + self._ps_endpoint = None + # trainers_endpoints, it is used for distribution. 
self._trainers_endpoints = [] + # the distributed lookup table names self._distributed_lookup_table = None @property @@ -2429,8 +2451,9 @@ class Program(object): "Program") self._is_distributed = other._is_distributed self._is_chief = other._is_chief - self._slice_vars_and_attrs = other._slice_vars_and_attrs + self._parameters_on_pservers = other._parameters_on_pservers self._endpoints = other._endpoints + self._ps_endpoint = other._ps_endpoint self._distributed_lookup_table = other._distributed_lookup_table def _copy_data_info_from(self, other): @@ -2690,5 +2713,18 @@ def _imperative_guard(tracer): global _imperative_tracer_ tmp_trace = _imperative_tracer_ _imperative_tracer_ = tracer + yield + _imperative_tracer_ = tmp_trace + + +@contextlib.contextmanager +def _imperative_place_guard(place): + global _imperative_current_expected_place_ + tmp_place = _imperative_current_expected_place_ + _imperative_current_expected_place_ = place + + yield + + _imperative_current_expected_place_ = tmp_place diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 5d3ebb25a935cea6ec376e6bc044281dcba37337..ff3984b11f42cf9e6ff49c8654c600c065effe1d 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -25,18 +25,28 @@ def enabled(): @contextlib.contextmanager -def guard(): +def guard(place=None): train = framework.Program() startup = framework.Program() tracer = core.Tracer(train.current_block().desc) + + if place is None: + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): - yield + with framework._imperative_place_guard(place): + yield def to_variable(value, block=None): if isinstance(value, np.ndarray): + assert enabled(), "to_variable could only be called in imperative mode" + if not block: block = framework.default_main_program().current_block() py_var = framework.Variable( @@ -47,9 +57,7 @@ def to_variable(value, block=None): dtype=value.dtype) var = py_var._ivar.value() tensor = var.get_tensor() - tensor.set(value, core.CPUPlace()) + tensor.set(value, framework._current_expected_place()) return py_var elif isinstance(value, framework.Variable): return value - else: - raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index f457f56203eb2c1da62f4d8ad8915c322c822e0a..71ff95bdea36967c1fa6b5c94cc7ca305e7a544a 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -15,6 +15,7 @@ import contextlib import sys import numpy as np +import collections from paddle.fluid import core from paddle.fluid import framework @@ -31,7 +32,23 @@ class Layer(core.Layer): self._dtype = dtype def parameters(self): - return [] + params = [] + for key in self.__dict__.keys(): + value = self.__dict__[key] + if isinstance(value, framework.Parameter): + params.append(value) + elif isinstance(value, core.Layer): + params.extend(value.parameters()) + elif isinstance(value, collections.Container): + if len(value) == 0: + continue + if isinstance(value[0], framework.Parameter): + params.extend(value) + elif isinstance(value[0], core.Layer): + for v in value: + params.extend(v.parameters()) + + return params def clear_gradients(self): for p in self.parameters(): diff --git a/python/paddle/fluid/imperative/nn.py 
b/python/paddle/fluid/imperative/nn.py index 03fbfe76d120e30edbe7f88ca716141d150d3a9c..6c5961cc63d1c140e0a6f33aac054acdbbe8e8e0 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -22,12 +22,7 @@ from . import layers from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from ..initializer import Normal, Constant - -__all__ = [ - 'Conv2D', - 'Pool2D', - 'FC', -] +__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding'] class Conv2D(layers.Layer): @@ -55,7 +50,8 @@ class Conv2D(layers.Layer): param_attr=param_attr, bias_attr=bias_attr, dtype=dtype, - name=name) + name=name, + act=act) self._groups = groups self._stride = utils.convert_to_list(stride, 2, 'stride') @@ -141,6 +137,7 @@ class Conv2D(layers.Layer): outputs={'Out': [pre_act]}, attrs={'axis': 1}) + # Currently, we don't support inplace in imperative mode return self._helper.append_activation(pre_act) @@ -216,6 +213,7 @@ class FC(layers.Layer): act=None, name=None): super(FC, self).__init__() + self._size = size self._num_flatten_dims = num_flatten_dims self._dtype = dtype @@ -241,6 +239,16 @@ class FC(layers.Layer): dtype=self._dtype, is_bias=False) + if self._helper.bias_attr: + size = list([self._size]) + self._b = self._helper.create_parameter( + attr=self._helper.bias_attr, + shape=size, + dtype=self._dtype, + is_bias=True) + else: + self._b = None + def forward(self, input): tmp = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( @@ -253,28 +261,238 @@ class FC(layers.Layer): "y_num_col_dims": 1 }) - out = self._helper.create_variable_for_type_inference(self._dtype) + pre_bias = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type="sum", inputs={"X": [tmp]}, - outputs={"Out": out}, + outputs={"Out": pre_bias}, attrs={"use_mkldnn": False}) - bias_attr = self._helper.bias_attr - if bias_attr: - # add bias - size = list(out.shape[1:]) - if not self._built: - self._b = self._helper.create_parameter( - attr=bias_attr, shape=size, dtype=out.dtype, is_bias=True) - bias_out = self._helper.create_variable_for_type_inference( - dtype=out.dtype) + if self._b: + pre_activation = self._helper.create_variable_for_type_inference( + dtype=self._dtype) self._helper.append_op( type='elementwise_add', - inputs={'X': [out], + inputs={'X': [pre_bias], 'Y': [self._b]}, - outputs={'Out': [bias_out]}, - attrs={'axis': 1}) - out = bias_out - # add activation - return self._helper.append_activation(out) + outputs={'Out': [pre_activation]}, + attrs={'axis': self._num_flatten_dims}) + else: + pre_activation = pre_bias + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(pre_activation) + + +class BatchNorm(layers.Layer): + def __init__(self, + num_channels, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + dtype=core.VarDesc.VarType.FP32, + data_layout='NCHW', + in_place=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=False, + fuse_with_relu=False, + use_global_stats=False): + super(BatchNorm, self).__init__() + + assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
+ + from ..layer_helper import LayerHelper + self._helper = LayerHelper( + 'batch_norm', + param_attr=param_attr, + bias_attr=bias_attr, + name=name, + act=act) + + if dtype == core.VarDesc.VarType.FP16: + self._dtype = core.VarDesc.VarType.FP32 + else: + self._dtype = dtype + + param_shape = [num_channels] + + # create parameter + self._scale = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + if use_global_stats and self._helper.param_attr.learning_rate == 0.: + self._scale._stop_gradient = True + + self._bias = self._helper.create_parameter( + attr=self._helper.bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + if use_global_stats and self._helper.bias_attr.learning_rate == 0.: + self._bias._stop_gradient = True + + self._mean = self._helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var), + shape=param_shape, + dtype=self._dtype) + self._mean._stop_gradient = True + + self._variance = self._helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var), + shape=param_shape, + dtype=self._dtype) + self._variance._stop_gradient = True + + self._in_place = in_place + self._momentum = momentum + self._epsilon = epsilon + self._is_test = is_test + self._fuse_with_relu = fuse_with_relu + self._use_global_stats = use_global_stats + + def _build_once(self, input): + pass + + def forward(self, input): + # create output + # mean and mean_out share the same memory + mean_out = self._mean + # variance and variance out share the same memory + variance_out = self._variance + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference( + self._dtype) + + self._helper.append_op( + type="batch_norm", + inputs={ + "X": input, + "Scale": self._scale, + "Bias": self._bias, + "Mean": self._mean, + "Variance": self._variance + }, + outputs={ + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance + }, + attrs={ + "momentum": self._momentum, + "epsilon": self._epsilon, + "is_test": self._is_test, + "use_mkldnn": False, + "fuse_with_relu": self._fuse_with_relu, + "use_global_stats": self._use_global_stats + }) + + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(batch_norm_out) + + +class Embedding(layers.Layer): + """ + **Embedding Layer** + + This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in + a lookup table. The result of this lookup is the embedding of each ID in the + :attr:`input`. + + All the input variables are passed in as local variables to the LayerHelper + constructor. + + Args: + size(tuple|list): The shape of the look up table parameter. It should + have two elements which indicate the size of the dictionary of + embeddings and the size of each embedding vector respectively. + is_sparse(bool): The flag indicating whether to use sparse update. + is_distributed(bool): Whether to run lookup table from remote parameter server. 
+ padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup. + Otherwise the given :attr:`padding_idx` indicates padding the output + with zeros whenever lookup encounters it in :attr:`input`. If + :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is + :math:`size[0] + dim`. + param_attr(ParamAttr): Parameters for this layer + dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc + + Returns: + Variable: The tensor variable storing the embeddings of the \ + supplied inputs. + + Examples: + .. code-block:: python + + dict_size = len(dataset.ids) + input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') + embedding = fluid.imperative.Embedding(size=[dict_size, 16]) + fc = embedding(input) + """ + + def __init__(self, + size, + is_sparse=False, + is_distributed=False, + padding_idx=None, + param_attr=None, + dtype='float32'): + + super(Embedding, self).__init__() + self._size = size + self._is_sparse = is_sparse + self._is_distributed = is_distributed + + self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( + size[0] + padding_idx) + + self._param_attr = param_attr + self._dtype = dtype + self._remote_prefetch = self._is_sparse and (not self._is_distributed) + if self._remote_prefetch: + assert self._is_sparse is True and self._is_distributed is False + + from ..layer_helper import LayerHelper + self._helper = LayerHelper('embedding', param_attr=param_attr) + self._w = self._helper.create_parameter( + attr=self._param_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + + def parameters(self): + return [self._w] + + def forward(self, input): + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type='lookup_table', + inputs={'Ids': input, + 'W': self._w}, + outputs={'Out': out}, + attrs={ + 'is_sparse': self._is_sparse, + 'is_distributed': self._is_distributed, + 'remote_prefetch': self._remote_prefetch, + 'padding_idx': self._padding_idx + }) + + return out diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 8a2cd4a9290ab1d2f3e2af2d682994c999d7b931..5be21ff7f7270f6ce950c069f61418c922bcedc5 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -24,7 +24,8 @@ __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', 'MSRA', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer', 'NormalInitializer', 'TruncatedNormalInitializer', - 'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer' + 'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer', + 'NumpyArrayInitializer' ] _force_init_on_cpu_ = False @@ -365,17 +366,40 @@ class TruncatedNormalInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['truncated_gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="truncated_gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "mean": self._mean, "std": self._std_dev, "seed": 
self._seed }, stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) var.op = op return op @@ -683,6 +707,64 @@ class BilinearInitializer(Initializer): return op +class NumpyArrayInitializer(Initializer): + """Init an parameter with an numpy array + + Args: + value (numpy): numpy array to initialize the variable + + Examples: + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2]))) + """ + + def __init__(self, value): + import numpy + assert isinstance(value, numpy.ndarray) + super(NumpyArrayInitializer, self).__init__() + self._value = value + + def __call__(self, var, block): + """Add constant initialization ops for a variable + + Args: + var: Variable that needs to be initialized + block: The block in which initialization ops + should be added + + Returns: + the initialization op + """ + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + # Initialization Ops should be prepended and not appended + dtype = framework.convert_np_dtype_to_dtype_(self._value.dtype) + if dtype == VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in self._value.flat] + elif dtype == VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in self._value.flat] + else: + raise ValueError("Unsupported dtype %s", self._value.dtype) + if self._value.size > 1024 * 1024 * 5: + raise ValueError("The size of input is too big. Please consider " + "saving it to file and 'load_op' to load it") + op = block._prepend_op( + type='assign_value', + outputs={'Out': var}, + attrs={ + 'dtype': dtype, + 'shape': list(self._value.shape), + value_name: values + }, + stop_gradient=True) + var.op = op + return op + + # We short the class name, since users will use the initializer with the package # name. The sample code: # diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index e74a87fc68db0e126098f7188db4a712dff2612d..6b1d4cc34f3cd40c878740f28618f26d5e89a6bd 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -19,6 +19,7 @@ import errno import time import shutil import six +from functools import reduce from paddle.fluid.executor import Executor from paddle.fluid.evaluator import Evaluator @@ -183,8 +184,6 @@ def save_vars(executor, # NOTE: don't save the variable which type is RAW if each_var.type == core.VarDesc.VarType.RAW: continue - if each_var.name == main_program._distributed_lookup_table: - continue new_var = _clone_var_in_block_(save_block, each_var) if filename is None: save_block.append_op( @@ -206,16 +205,6 @@ def save_vars(executor, outputs={}, attrs={'file_path': os.path.join(dirname, filename)}) - # if there is lookup table, the trainer 0 will notify all pserver to save. 
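# Hedged usage sketch of the NumpyArrayInitializer added above (assumes the Paddle
# Fluid 1.x static-graph API; the variable names are illustrative). The array's dtype
# must be float32 or int32 and its shape must match the parameter being initialized;
# very large arrays should be saved to file and loaded instead, as the size check
# above enforces.
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[2], dtype='float32')
fc = fluid.layers.fc(
    input=x,
    size=3,
    param_attr=fluid.ParamAttr(
        initializer=fluid.initializer.NumpyArrayInitializer(
            np.ones([2, 3], dtype='float32'))))   # fc weight shape is [2, 3]
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())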
- if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: - lookup_table_filename = os.path.join(dirname, "__lookup_table__") - attrs = {} - attrs['epmap'] = main_program._endpoints - attrs['dir'] = lookup_table_filename - attrs['lookup_table'] = main_program._distributed_lookup_table - save_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(save_program) @@ -267,6 +256,186 @@ def save_params(executor, dirname, main_program=None, filename=None): filename=filename) +def _save_distributed_persistables(executor, dirname, main_program): + """ + save_persistables for distributed training. + the method will do things listed below: + 1.save part of persistable variables on trainer. + 2.receive "remote prefetch variables" from parameter servers and merge them. + 3.save "distributed lookup table" on parameter servers. + 4.receive "optimizer variables" from parameter servers and merge them. + + Args: + executor(Executor): The executor to run for saving parameters. + dirname(str): The saving directory path. + main_program(Program): The program whose parameters will be + saved. the main_program must be the trainer_program + get after transpiler. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + t = distribute_transpiler.DistributeTranspiler() + t.transpile(...) + train_program = t.get_trainer_program() + _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program) + """ + + def __save_remote_params(executor, dirname, remote_params_map): + """ + recive params on pserver through rpc. + if the params are be sliced, will concat them to one, then save it. + """ + if not remote_params_map: + return + + prog = Program() + block = prog.global_block() + + # recv optimize vars from pserver + for name, remote_params in remote_params_map.items(): + origin_var = None + is_slice = False + slice_vars = [0] * len(remote_params) + slice_var_names = [""] * len(remote_params) + endpoints = [""] * len(remote_params) + + for idx, optimizer in enumerate(remote_params): + origin = optimizer.origin + slice = optimizer.slice + is_slice = optimizer.is_slice + block_id = optimizer.block_id + endpoint = optimizer.endpoint + + if idx == 0: + origin_var = block.create_var( + name=origin.name, + type=origin.type, + shape=origin.shape, + dtype=origin.dtype, + persistable=True) + + slice_var = block.create_var( + name="{}.slice.{}".format(slice.name, idx), + type=slice.type, + shape=slice.shape, + dtype=slice.dtype, + persistable=True) + + index = block_id if is_slice else idx + slice_vars[index] = slice_var + slice_var_names[index] = slice.name + endpoints[index] = endpoint + + if is_slice: + block.append_op( + type='recv', + inputs={"X": []}, + outputs={"Out": slice_vars}, + attrs={ + "epmap": endpoints, + "with_barrier": False, + "varnames": slice_var_names, + "sync_mode": True + }) + block.append_op( + type='concat', + inputs={'X': slice_vars}, + outputs={'Out': origin_var}, + attrs={}) + else: + block.append_op( + type='recv', + inputs={"X": []}, + outputs={"Out": [origin_var]}, + attrs={ + "epmap": endpoints[:1], + "with_barrier": False, + "varnames": slice_var_names, + "sync_mode": True + }) + block.append_op( + type='save', + inputs={'X': [origin_var]}, + outputs={}, + attrs={'file_path': os.path.join(dirname, origin_var.name)}) + block.append_op(type='delete_var', inputs={'X': slice_vars}) + executor.run(prog) + + 
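# Plain-numpy illustration of what __save_remote_params above reconstructs: every
# parameter server holds a row-wise slice of the original optimizer variable; the
# slices are received in block_id order and concatenated along axis 0 to recover
# the full tensor before it is written out with the save op.
import numpy as np

origin = np.arange(12, dtype='float32').reshape(6, 2)
pserver_slices = {0: origin[0:3].copy(), 1: origin[3:6].copy()}   # block_id -> shard
merged = np.concatenate([pserver_slices[i] for i in sorted(pserver_slices)], axis=0)
assert np.array_equal(merged, origin)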
def __save_distributed_lookup_tables(executor, dirname, + distributed_lookup_table, endpoints): + """ + because the distributed lookup table may too huge to merge and save at one place, + it will be saved at parameter server independent respectively. + + the save directory is dirname/"__lookup_table__". + + """ + prog = Program() + block = prog.global_block() + + # if there is lookup table, the trainer 0 will notify all pserver to save. + lookup_table_filename = os.path.join(dirname, "__lookup_table__") + attrs = {} + attrs['epmap'] = endpoints + attrs['dir'] = lookup_table_filename + attrs['lookup_table'] = distributed_lookup_table + block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(prog) + + def __exclude_vars(exclude_var_names=[]): + def is_valid(var): + if var.name in exclude_var_names: + return False + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.READER: + return False + return var.persistable + + return is_valid + + if not isinstance(main_program, Program): + raise ValueError("'main_program' should be an instance of Program.") + + if not main_program._is_distributed: + raise ValueError( + "'_save_distributed_persistables' just be designed for distributed training." + ) + + remote_params_map = main_program._parameters_on_pservers.get_distributed_vars_by_vtypes( + ["Optimizer", "RemotePrefetch"], groupby=True) + + exclude_var_names = [] + if remote_params_map: + exclude_var_names.extend(remote_params_map.keys()) + + if main_program._distributed_lookup_table: + if isinstance(main_program._distributed_lookup_table, list): + exclude_var_names.extend(main_program._distributed_lookup_table) + else: + exclude_var_names.append(main_program._distributed_lookup_table) + + local_vars = list( + filter(__exclude_vars(exclude_var_names), main_program.list_vars())) + save_vars( + executor, main_program=main_program, dirname=dirname, vars=local_vars) + + if main_program._is_chief: + if remote_params_map: + __save_remote_params(executor, dirname, remote_params_map) + if main_program._distributed_lookup_table: + __save_distributed_lookup_tables( + executor, dirname, main_program._distributed_lookup_table, + main_program._endpoints) + + def save_persistables(executor, dirname, main_program=None, filename=None): """ This function filters out all variables with `persistable==True` from the @@ -301,13 +470,19 @@ def save_persistables(executor, dirname, main_program=None, filename=None): fluid.io.save_persistables(executor=exe, dirname=param_path, main_program=None) """ - save_vars( - executor, - dirname=dirname, - main_program=main_program, - vars=None, - predicate=is_persistable, - filename=filename) + + if main_program and main_program._is_distributed: + _save_distributed_persistables( + executor, dirname=dirname, main_program=main_program) + + else: + save_vars( + executor, + dirname=dirname, + main_program=main_program, + vars=None, + predicate=is_persistable, + filename=filename) def load_vars(executor, @@ -402,17 +577,11 @@ def load_vars(executor, if not isinstance(main_program, Program): raise TypeError("program should be as Program type or None") - load_slice_vars = [] - for each_var in main_program._slice_vars_and_attrs: - load_slice_vars.append(each_var[2].name) - load_var_map = {} for each_var in vars: assert isinstance(each_var, Variable) if each_var.type == core.VarDesc.VarType.RAW: continue - if each_var.name in load_slice_vars: - 
continue new_var = _clone_var_in_block_(load_block, each_var) if filename is None: load_block.append_op( @@ -435,10 +604,6 @@ def load_vars(executor, attrs={'file_path': os.path.join(dirname, filename)}) executor.run(load_prog) - # load slice vars on pserver, if have it. - _load_slice_up_vars(executor, dirname, - main_program._slice_vars_and_attrs) - def load_params(executor, dirname, main_program=None, filename=None): """ @@ -521,12 +686,134 @@ def load_persistables(executor, dirname, main_program=None, filename=None): fluid.io.load_persistables(executor=exe, dirname=param_path, main_program=None) """ - load_vars( - executor, - dirname=dirname, - main_program=main_program, - predicate=is_persistable, - filename=filename) + + if main_program and main_program._is_distributed: + _load_distributed_persistables( + executor, dirname=dirname, main_program=main_program) + else: + load_vars( + executor, + dirname=dirname, + main_program=main_program, + predicate=is_persistable, + filename=filename) + + +def _load_distributed_persistables(executor, dirname, main_program=None): + """ + customized load_persistables for distributed training. + it should be used on parameter server, + + Args: + executor(Executor): The executor to run for saving parameters. + dirname(str): The load directory path. + main_program(Program): The program whose parameters will be + loaded. the main_program must be the pserver_program + get after transpiler. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + t = distribute_transpiler.DistributeTranspiler() + t.transpile(...) + pserver_prog = t.get_pserver_program(...) + _load_distributed_persistables(executor=exe, dirname=param_path, main_program=pserver_prog) + """ + + def __is_distributed_part_var(varname): + trainer_idx = varname.find(".trainer_") + block_idx = varname.find(".block") + return trainer_idx or block_idx + + def __load_persistable_vars(executor, dirname, need_load_vars): + load_prog = Program() + load_block = load_prog.global_block() + need_delete_vars = [] + + for param in need_load_vars: + origin_var = param.origin + slice_var = param.slice + is_slice = param.is_slice + offset = param.offset + + if is_slice: + origin = load_block.create_var( + name="{}.load".format(origin_var.name), + type=origin_var.type, + shape=origin_var.shape, + dtype=origin_var.dtype, + persistable=True) + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [origin]}, + attrs={ + 'file_path': os.path.join(dirname, origin_var.name) + }) + + slice = load_block.create_var( + name=slice_var.name, + type=slice_var.type, + shape=slice_var.shape, + dtype=slice_var.dtype, + persistable=True) + + dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:]) + start = int(offset / dim1_flatten) + end = int(offset / dim1_flatten + slice.shape[0]) + + load_block.append_op( + type="slice", + inputs={'Input': origin}, + outputs={'Out': slice}, + attrs={'axes': [0], + 'starts': [start], + 'ends': [end]}) + + need_delete_vars.append(origin) + else: + origin = load_block.create_var( + name="{}".format(origin_var.name), + type=origin_var.type, + shape=origin_var.shape, + dtype=origin_var.dtype, + persistable=True) + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [origin]}, + attrs={ + 'file_path': os.path.join(dirname, origin_var.name) + }) + + load_block.append_op( + type='delete_var', + inputs={'X': need_delete_vars}, ) + + executor.run(load_prog) + + if not 
isinstance(main_program, Program): + raise ValueError("'main_program' should be an instance of Program.") + + if not main_program._is_distributed: + raise ValueError( + "'_load_distributed_persistables' just be designed for distributed training." + ) + + if not main_program._ps_endpoint: + raise ValueError( + "'_load_distributed_persistables' need current_endpoint set in DistributeTranspiler.transpile" + ) + + need_load_vars = main_program._parameters_on_pservers.get_distributed_vars_by_ep( + main_program._ps_endpoint) + __load_persistable_vars(executor, dirname, need_load_vars) def prepend_feed_ops(inference_program, @@ -795,52 +1082,6 @@ def load_inference_model(dirname, return [program, feed_target_names, fetch_targets] -def _save_lookup_tables_by_notify(executor, dirname, lookup_table, - pserver_endpoints): - """ - This function will send checkpoint notify message from Trainer 0 - to all the pservers. - The checkpoint notify message contains lookup table name, - the absolute path on pserver to save lookup_table. - - Args: - executor(Executor): The executor to run for send checkpoint notify. - dirname(str): The folder where to save. - lookup_table(string): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - ps_endpoint_list(list): the parameter server ip:port list. - when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - Return: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - _save_pserver_vars_by_notify(executor=exe, - dirname=param_path, lookup_table=table_name, - pserver_endpoints=ps_endpoints) - """ - - pserver_notify_program = Program() - pserver_notify_block = pserver_notify_program.global_block() - - attrs = {} - attrs['epmap'] = pserver_endpoints - attrs['dir'] = dirname - attrs['lookup_table'] = lookup_table - - pserver_notify_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(pserver_notify_program) - - def _endpoints_replacement(program, endpoints): ENDPOINT_MAP = "epmap" for op in program.global_block().ops: @@ -911,54 +1152,3 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) - - -def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): - if not slice_vars_and_attrs: - return - - load_prog = Program() - load_block = load_prog.global_block() - need_delete_vars = [] - - for var_tuple in slice_vars_and_attrs: - orig_var = var_tuple[0] - start = var_tuple[1] - slice_var = var_tuple[2] - end = start + slice_var.shape[0] - - orig_var_name = orig_var.name - orig_var.name = "{}.origin".format(orig_var_name) - - clone_orig_var = load_block.create_var( - name=orig_var.name, - type=orig_var.type, - shape=orig_var.shape, - dtype=orig_var.dtype, - persistable=True) - - clone_slice_var = load_block.create_var( - name=slice_var.name, - type=slice_var.type, - shape=slice_var.shape, - dtype=slice_var.dtype, - persistable=True) - - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [clone_orig_var]}, - attrs={'file_path': os.path.join(dirname, orig_var_name)}) - load_block.append_op( - type="slice", - inputs={'Input': clone_orig_var}, - outputs={'Out': clone_slice_var}, - attrs={'axes': [0], - 'starts': [start], - 'ends': [end]}) 
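# Worked example (plain Python/numpy) of the offset arithmetic used in
# __load_persistable_vars above: the flat element offset of a shard inside the
# original parameter is converted into row indices by dividing by the flattened
# size of dimensions 1..n, and the slice op then cuts rows [start, end) out of
# the tensor loaded from file.
from functools import reduce
import numpy as np

origin = np.arange(24, dtype='float32').reshape(6, 4)        # full saved parameter
slice_shape = (3, 4)                                         # this pserver's shard
offset = 12                                                  # flat offset of the shard
dim1_flatten = reduce(lambda a, b: a * b, slice_shape[1:])   # 4 elements per row
start = int(offset / dim1_flatten)                           # row 3
end = int(offset / dim1_flatten + slice_shape[0])            # row 6
assert np.array_equal(origin[start:end], origin[3:6])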
- need_delete_vars.append(clone_orig_var) - - load_block.append_op( - type='delete_var', - inputs={'X': need_delete_vars}, ) - executor.run(load_prog) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index ea9953f5814207c569e799c2ffa11758e31699aa..a172141b3a0455769dc1ce74d098be057324e047 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -300,6 +300,17 @@ class LayerHelper(object): attr.name = unique_name.generate(".".join([self.name, suffix])) if default_initializer is None and attr.initializer is None: + if isinstance(dtype, core.VarDesc.VarType): + if dtype != core.VarDesc.VarType.FP32 and \ + dtype != core.VarDesc.VarType.FP64: + raise TypeError( + "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!" + ) + else: + if not (dtype.startswith("float") or dtype == "double"): + raise TypeError( + "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!" + ) if is_bias: attr._set_default_bias_initializer() else: @@ -435,7 +446,10 @@ class LayerHelper(object): act_type = act.pop('type') tmp = input_var # NOTE(dzhwinter): some activation support inplace compution. - if not core.IsInplace(act_type): + # NOTE(minqiyang): currently, we don't support inplace in imperative mode + if not imperative_base.enabled() and core.IsInplace(act_type): + tmp = input_var + else: tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) self.append_op( type=act_type, diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 8aed97dc59b100d4e37832e0a148d73662742ba0..0602d7a19481fbf0210a7cb4bd15a1033b0e8900 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -44,10 +44,12 @@ __all__ = [ 'roi_perspective_transform', 'generate_proposal_labels', 'generate_proposals', + 'generate_mask_labels', 'iou_similarity', 'box_coder', 'polygon_box_transform', 'yolov3_loss', + 'multiclass_nms', ] @@ -261,8 +263,10 @@ def detection_output(loc, number is N + 1, N is the batch size. The i-th image has `LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image has no detected results. If all images have not detected results, - all the elements in LoD are 0, and output tensor only contains one + LoD will be set to {1}, and output tensor only contains one value, which is -1. + (After version 1.3, when no boxes detected, the lod is changed + from {0} to {1}.) Examples: .. code-block:: python @@ -342,19 +346,107 @@ def box_coder(prior_box, target_box, code_type="encode_center_size", box_normalized=True, - name=None): + name=None, + axis=0): """ - ${comment} + **Box Coder Layer** + + Encode/Decode the target bounding box with the priorbox information. + + The Encoding schema described below: + + .. math:: + + ox = (tx - px) / pw / pxv + + oy = (ty - py) / ph / pyv + + ow = \log(\abs(tw / pw)) / pwv + + oh = \log(\abs(th / ph)) / phv + + The Decoding schema described below: + + .. math:: + + ox = (pw * pxv * tx * + px) - tw / 2 + + oy = (ph * pyv * ty * + py) - th / 2 + + ow = \exp(pwv * tw) * pw + tw / 2 + + oh = \exp(phv * th) * ph + th / 2 + + where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, + width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote + the priorbox's (anchor) center coordinates, width and height. 
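# Numpy sketch of the encode_center_size schema written out above (an illustration
# of the documented formulas, not the box_coder op itself): corner-form boxes are
# converted to center/size form and normalized by the prior box and its variance.
import numpy as np

def encode_center_size(target, prior, prior_var):
    tx, ty = (target[0] + target[2]) / 2., (target[1] + target[3]) / 2.
    tw, th = target[2] - target[0], target[3] - target[1]
    px, py = (prior[0] + prior[2]) / 2., (prior[1] + prior[3]) / 2.
    pw, ph = prior[2] - prior[0], prior[3] - prior[1]
    return np.array([(tx - px) / pw / prior_var[0],
                     (ty - py) / ph / prior_var[1],
                     np.log(abs(tw / pw)) / prior_var[2],
                     np.log(abs(th / ph)) / prior_var[3]])

print(encode_center_size(np.array([2., 2., 6., 8.]),
                         np.array([0., 0., 4., 4.]),
                         np.array([0.1, 0.1, 0.2, 0.2])))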
`pxv`, + `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, + `ow`, `oh` denote the encoded/decoded coordinates, width and height. + + During Box Decoding, two modes for broadcast are supported. Say target + box has shape [N, M, 4], and the shape of prior box can be [N, 4] or + [M, 4]. Then prior box will broadcast to target box along the + assigned axis. Args: - prior_box(${prior_box_type}): ${prior_box_comment} - prior_box_var(${prior_box_var_type}): ${prior_box_var_comment} - target_box(${target_box_type}): ${target_box_comment} - code_type(${code_type_type}): ${code_type_comment} - box_normalized(${box_normalized_type}): ${box_normalized_comment} + prior_box(Variable): Box list prior_box is a 2-D Tensor with shape + [M, 4] holds M boxes, each box is represented as + [xmin, ymin, xmax, ymax], [xmin, ymin] is the + left top coordinate of the anchor box, if the + input is image feature map, they are close to + the origin of the coordinate system. [xmax, ymax] + is the right bottom coordinate of the anchor box. + prior_box_var(Variable|list): prior_box_var supports two types of input. + One is variable with shape [M, 4] holds M group. + The other one is list consist of 4 elements + shared by all boxes. + target_box(Variable): This input can be a 2-D LoDTensor with shape + [N, 4] when code_type is 'encode_center_size'. + This input also can be a 3-D Tensor with shape + [N, M, 4] when code_type is 'decode_center_size'. + Each box is represented as + [xmin, ymin, xmax, ymax]. This tensor can + contain LoD information to represent a batch + of inputs. + code_type(string): The code type used with the target box. It can be + encode_center_size or decode_center_size + box_normalized(int): Whether treat the priorbox as a noramlized box. + Set true by default. + name(string): The name of box coder. + axis(int): Which axis in PriorBox to broadcast for box decode, + for example, if axis is 0 and TargetBox has shape + [N, M, 4] and PriorBox has shape [M, 4], then PriorBox + will broadcast to [N, M, 4] for decoding. It is only valid + when code type is decode_center_size. Set 0 by default. Returns: - output_box(${output_box_type}): ${output_box_comment} + output_box(Variable): When code_type is 'encode_center_size', the + output tensor of box_coder_op with shape + [N, M, 4] representing the result of N target + boxes encoded with M Prior boxes and variances. + When code_type is 'decode_center_size', + N represents the batch size and M represents + the number of deocded boxes. + + Examples: + + .. 
code-block:: python + + prior_box = fluid.layers.data(name='prior_box', + shape=[512, 4], + dtype='float32', + append_batch_size=False) + target_box = fluid.layers.data(name='target_box', + shape=[512,81,4], + dtype='float32', + append_batch_size=False) + output = fluid.layers.box_coder(prior_box=prior_box, + prior_box_var=[0.1,0.1,0.2,0.2], + target_box=target_box, + code_type="decode_center_size", + box_normalized=False, + axis=1) + """ helper = LayerHelper("box_coder", **locals()) @@ -365,15 +457,22 @@ def box_coder(prior_box, output_box = helper.create_variable( name=name, dtype=prior_box.dtype, persistable=False) + inputs = {"PriorBox": prior_box, "TargetBox": target_box} + attrs = { + "code_type": code_type, + "box_normalized": box_normalized, + "axis": axis + } + if isinstance(prior_box_var, Variable): + inputs['PriorBoxVar'] = prior_box_var + elif isinstance(prior_box_var, list): + attrs['variance'] = prior_box_var + else: + raise TypeError("Input variance of box_coder must be Variable or lisz") helper.append_op( type="box_coder", - inputs={ - "PriorBox": prior_box, - "PriorBoxVar": prior_box_var, - "TargetBox": target_box - }, - attrs={"code_type": code_type, - "box_normalized": box_normalized}, + inputs=inputs, + attrs=attrs, outputs={"OutputBox": output_box}) return output_box @@ -409,13 +508,10 @@ def yolov3_loss(x, gtbox, gtlabel, anchors, + anchor_mask, class_num, ignore_thresh, - loss_weight_xy=None, - loss_weight_wh=None, - loss_weight_conf_target=None, - loss_weight_conf_notarget=None, - loss_weight_class=None, + downsample_ratio, name=None): """ ${comment} @@ -427,16 +523,13 @@ def yolov3_loss(x, and x, y, w, h should be relative value of input image. N is the batch number and B is the max box number in an image. - gtlabel (Variable): class id of ground truth boxes, shoud be ins shape + gtlabel (Variable): class id of ground truth boxes, shoud be in shape of [N, B]. 
anchors (list|tuple): ${anchors_comment} + anchor_mask (list|tuple): ${anchor_mask_comment} class_num (int): ${class_num_comment} ignore_thresh (float): ${ignore_thresh_comment} - loss_weight_xy (float|None): ${loss_weight_xy_comment} - loss_weight_wh (float|None): ${loss_weight_wh_comment} - loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment} - loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment} - loss_weight_class (float|None): ${loss_weight_class_comment} + downsample_ratio (int): ${downsample_ratio_comment} name (string): the name of yolov3 loss Returns: @@ -456,9 +549,10 @@ def yolov3_loss(x, x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') - anchors = [10, 13, 16, 30, 33, 23] - loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 - anchors=anchors, ignore_thresh=0.5) + anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] + anchors = [0, 1, 2] + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors, + ignore_thresh=0.5, downsample_ratio=32) """ helper = LayerHelper('yolov3_loss', **locals()) @@ -470,6 +564,8 @@ def yolov3_loss(x, raise TypeError("Input gtlabel of yolov3_loss must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple): + raise TypeError("Attr anchor_mask of yolov3_loss must be list or tuple") if not isinstance(class_num, int): raise TypeError("Attr class_num of yolov3_loss must be an integer") if not isinstance(ignore_thresh, float): @@ -482,31 +578,29 @@ def yolov3_loss(x, loss = helper.create_variable( name=name, dtype=x.dtype, persistable=False) + objectness_mask = helper.create_variable_for_type_inference(dtype='int32') + gt_match_mask = helper.create_variable_for_type_inference(dtype='int32') + attrs = { "anchors": anchors, + "anchor_mask": anchor_mask, "class_num": class_num, "ignore_thresh": ignore_thresh, + "downsample_ratio": downsample_ratio, } - if loss_weight_xy is not None and isinstance(loss_weight_xy, float): - self.attrs['loss_weight_xy'] = loss_weight_xy - if loss_weight_wh is not None and isinstance(loss_weight_wh, float): - self.attrs['loss_weight_wh'] = loss_weight_wh - if loss_weight_conf_target is not None and isinstance( - loss_weight_conf_target, float): - self.attrs['loss_weight_conf_target'] = loss_weight_conf_target - if loss_weight_conf_notarget is not None and isinstance( - loss_weight_conf_notarget, float): - self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget - if loss_weight_class is not None and isinstance(loss_weight_class, float): - self.attrs['loss_weight_class'] = loss_weight_class - helper.append_op( type='yolov3_loss', - inputs={"X": x, - "GTBox": gtbox, - "GTLabel": gtlabel}, - outputs={'Loss': loss}, + inputs={ + "X": x, + "GTBox": gtbox, + "GTLabel": gtlabel, + }, + outputs={ + 'Loss': loss, + 'ObjectnessMask': objectness_mask, + 'GTMatchMask': gt_match_mask + }, attrs=attrs) return loss @@ -1659,7 +1753,7 @@ def generate_proposal_labels(rpn_rois, class_nums=None, use_random=True): """ - ** Generate proposal labels Faster-RCNN ** + ** Generate Proposal Labels of Faster-RCNN ** This operator can be, for given the GenerateProposalOp output bounding boxes and 
groundtruth, to sample foreground boxes and background boxes, and compute loss target. @@ -1740,6 +1834,140 @@ def generate_proposal_labels(rpn_rois, return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights +def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, + labels_int32, num_classes, resolution): + """ + ** Generate Mask Labels for Mask-RCNN ** + + This operator can be, for given the RoIs and corresponding labels, + to sample foreground RoIs. This mask branch also has + a :math: `K \\times M^{2}` dimensional output targets for each foreground + RoI, which encodes K binary masks of resolution M x M, one for each of the + K classes. This mask targets are used to compute loss of mask branch. + + Please note, the data format of groud-truth segmentation, assumed the + segmentations are as follows. The first instance has two gt objects. + The second instance has one gt object, this object has two gt segmentations. + + .. code-block:: python + + #[ + # [[[229.14, 370.9, 229.14, 370.9, ...]], + # [[343.7, 139.85, 349.01, 138.46, ...]]], # 0-th instance + # [[[500.0, 390.62, ...],[115.48, 187.86, ...]]] # 1-th instance + #] + + batch_masks = [] + for semgs in batch_semgs: + gt_masks = [] + for semg in semgs: + gt_segm = [] + for polys in semg: + gt_segm.append(np.array(polys).reshape(-1, 2)) + gt_masks.append(gt_segm) + batch_masks.append(gt_masks) + + + place = fluid.CPUPlace() + feeder = fluid.DataFeeder(place=place, feed_list=feeds) + feeder.feed(batch_masks) + + Args: + im_info(Variable): A 2-D Tensor with shape [N, 3]. N is the batch size, + each element is [height, width, scale] of image. Image scale is + target_size) / original_size. + gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the total + number of ground-truth, each element is a class label. + is_crowd(Variable): A 2-D LoDTensor with shape as gt_classes, + each element is a flag indicating whether a groundtruth is crowd. + gt_segms(Variable): This input is a 2D LoDTensor with shape [S, 2], + it's LoD level is 3. Usually users do not needs to understand LoD, + The users should return correct data format in reader. + + + + The LoD[0] represents the gt objects number of + each instance. LoD[1] represents the segmentation counts of each + objects. LoD[2] represents the polygons number of each segmentation. + S the total number of polygons coordinate points. Each element is + (x, y) coordinate points. + rois(Variable): A 2-D LoDTensor with shape [R, 4]. R is the total + number of RoIs, each element is a bounding box with + (xmin, ymin, xmax, ymax) format in the range of original image. + labels_int32(Variable): A 2-D LoDTensor in shape of [R, 1] with type + of int32. R is the same as it in `rois`. Each element repersents + a class label of a RoI. + num_classes(int): Class number. + resolution(int): Resolution of mask predictions. + + Returns: + mask_rois (Variable): A 2D LoDTensor with shape [P, 4]. P is the total + number of sampled RoIs. Each element is a bounding box with + [xmin, ymin, xmax, ymax] format in range of orignal image size. + mask_rois_has_mask_int32 (Variable): A 2D LoDTensor with shape [P, 1], + each element repersents the output mask RoI index with regard to + to input RoIs. + mask_int32 (Variable): A 2D LoDTensor with shape [P, K * M * M], + K is the classes number and M is the resolution of mask predictions. + Each element repersents the binary mask targets. + + Examples: + .. 
code-block:: python + + im_info = fluid.layers.data(name="im_info", shape=[3], + dtype="float32") + gt_classes = fluid.layers.data(name="gt_classes", shape=[1], + dtype="float32", lod_level=1) + is_crowd = fluid.layers.data(name="is_crowd", shape=[1], + dtype="float32", lod_level=1) + gt_masks = fluid.layers.data(name="gt_masks", shape=[2], + dtype="float32", lod_level=3) + # rois, labels_int32 can be the output of + # fluid.layers.generate_proposal_labels. + mask_rois, mask_index, mask_int32 = fluid.layers.generate_mask_labels( + im_info=im_info, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_segms=gt_masks, + rois=rois, + labels_int32=labels_int32, + num_classes=81, + resolution=14) + """ + + helper = LayerHelper('generate_mask_labels', **locals()) + + mask_rois = helper.create_variable_for_type_inference(dtype=rois.dtype) + roi_has_mask_int32 = helper.create_variable_for_type_inference( + dtype=gt_classes.dtype) + mask_int32 = helper.create_variable_for_type_inference( + dtype=gt_classes.dtype) + + helper.append_op( + type="generate_mask_labels", + inputs={ + 'ImInfo': im_info, + 'GtClasses': gt_classes, + 'IsCrowd': is_crowd, + 'GtSegms': gt_segms, + 'Rois': rois, + 'LabelsInt32': labels_int32 + }, + outputs={ + 'MaskRois': mask_rois, + 'RoiHasMaskInt32': roi_has_mask_int32, + 'MaskInt32': mask_int32 + }, + attrs={'num_classes': num_classes, + 'resolution': resolution}) + + mask_rois.stop_gradient = True + roi_has_mask_int32.stop_gradient = True + mask_int32.stop_gradient = True + + return mask_rois, roi_has_mask_int32, mask_int32 + + def generate_proposals(scores, bbox_deltas, im_info, @@ -1754,33 +1982,48 @@ def generate_proposals(scores, """ **Generate proposal Faster-RCNN** - This operation proposes RoIs according to each box with their probability to be a foreground object and - the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals + This operation proposes RoIs according to each box with their + probability to be a foreground object and + the box can be calculated by anchors. Bbox_deltais and scores + to be an object are the output of RPN. Final proposals could be used to train detection net. For generating proposals, this operation performs following steps: - 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) + 1. Transposes and resizes scores and bbox_deltas in size of + (H*W*A, 1) and (H*W*A, 4) 2. Calculate box locations as proposals candidates. 3. Clip boxes to image 4. Remove predicted boxes with small area. 5. Apply NMS to get final proposals as output. Args: - scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. - N is batch size, A is number of anchors, H and W are height and width of the feature map. - bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. - im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale + scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents + the probability for each box to be an object. + N is batch size, A is number of anchors, H and W are height and + width of the feature map. + bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] + represents the differece between predicted box locatoin and + anchor location. 
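# Shape illustration (plain numpy) for the mask targets described in
# generate_mask_labels above: every sampled foreground RoI gets K binary masks of
# resolution M x M, one per class, flattened into a single row of the mask_int32
# output, so the output shape is [P, K * M * M].
import numpy as np

num_classes, resolution, num_rois = 81, 14, 4
masks = np.zeros((num_rois, num_classes, resolution, resolution), dtype='int32')
masks[0, 3, :, :] = 1                       # RoI 0 is class 3: fill its M x M mask
mask_int32 = masks.reshape(num_rois, num_classes * resolution * resolution)
print(mask_int32.shape)                     # (4, 15876)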
+ im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin + image information for N batch. Info contains height, width and scale between origin image size and the size of feature map. - anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, - num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. - variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. - pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default. - post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default. + anchors(Variable): A 4-D Tensor represents the anchors with a layout + of [H, W, A, 4]. H and W are height and width of the feature map, + num_anchors is the box count of each position. Each anchor is + in (xmin, ymin, xmax, ymax) format an unnormalized. + variances(Variable): The expanded variances of anchors with a layout of + [H, W, num_priors, 4]. Each variance is in + (xcenter, ycenter, w, h) format. + pre_nms_top_n(float): Number of total bboxes to be kept per + image before NMS. 6000 by default. + post_nms_top_n(float): Number of total bboxes to be kept per + image after NMS. 1000 by default. nms_thresh(float): Threshold in NMS, 0.5 by default. - min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. - eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. - + min_size(float): Remove predicted boxes with either height or + width < min_size. 0.1 by default. + eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, + adaptive_threshold = adaptive_threshold * eta in each iteration. """ helper = LayerHelper('generate_proposals', **locals()) @@ -1810,3 +2053,119 @@ def generate_proposals(scores, rpn_roi_probs.stop_gradient = True return rpn_rois, rpn_roi_probs + + +def multiclass_nms(bboxes, + scores, + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold=0.3, + normalized=True, + nms_eta=1., + background_label=0, + name=None): + """ + **Multiclass NMS** + + This operator is to do multi-class non maximum suppression (NMS) on + boxes and scores. + + In the NMS step, this operator greedily selects a subset of detection bounding + boxes that have high scores larger than score_threshold, if providing this + threshold, then selects the largest nms_top_k confidences scores if nms_top_k + is larger than -1. Then this operator pruns away boxes that have high IOU + (intersection over union) overlap with already selected boxes by adaptive + threshold NMS based on parameters of nms_threshold and nms_eta. + + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + + Args: + bboxes (Variable): Two types of bboxes are supported: + 1. (Tensor) A 3-D Tensor with shape + [N, M, 4 or 8 16 24 32] represents the + predicted locations of M bounding bboxes, + N is the batch size. Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] + M is the number of bounding boxes, C is the + class number + scores (Variable): Two types of scores are supported: + 1. 
(Tensor) A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. + 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. + M is the number of bbox, C is the class number. + In this case, input BBoxes should be the second + case with shape [M, C, 4]. + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. If not provided, + consider all boxes. + nms_top_k (int): Maximum number of detections to be kept according to + the confidences aftern the filtering detections based + on score_threshold. + nms_threshold (float): The threshold to be used in NMS. Default: 0.3 + nms_eta (float): The threshold to be used in NMS. Default: 1.0 + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + normalized (bool): Whether detections are normalized. Default: True + name(str): Name of the multiclass nms op. Default: None. + + Returns: + Out: A 2-D LoDTensor with shape [No, 6] represents the detections. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + or A 2-D LoDTensor with shape [No, 10] represents the detections. + Each row has 10 values: + [label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the + total number of detections. If there is no detected boxes for all + images, lod will be set to {1} and Out only contains one value + which is -1. + (After version 1.3, when no boxes detected, the lod is changed + from {0} to {1}) + + Examples: + .. 
code-block:: python + + boxes = fluid.layers.data(name='bboxes', shape=[81, 4], + dtype='float32', lod_level=1) + scores = fluid.layers.data(name='scores', shape=[81], + dtype='float32', lod_level=1) + out = fluid.layers.multiclass_nms(bboxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False) + """ + helper = LayerHelper('multiclass_nms', **locals()) + + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + helper.append_op( + type="multiclass_nms", + inputs={'BBoxes': bboxes, + 'Scores': scores}, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'nms_top_k': nms_top_k, + 'nms_threshold': nms_threshold, + 'nms_eta': nms_eta, + 'keep_top_k': keep_top_k, + 'nms_eta': nms_eta, + 'normalized': normalized + }, + outputs={'Out': output}) + output.stop_gradient = True + + return output diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 9a29b2509357c93a684d736cf0d2523970fb5ff1..1762bd3e343e8af6768dd23f8fbc58cd0182d3c9 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -523,7 +523,7 @@ def _py_reader(capacity, double_buffer_name = "_".join([name, "double_buffer"]) var = global_scope().var(queue_name) - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=reader_name) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index dde05189722fef77e03a1c2d8f3cbae44a3e8245..617704a53138bd081a2ebe318de0c89e8db4aa96 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -321,7 +321,7 @@ def append_LARS(params_grads, learning_rate, weight_decay): The decayed learning rate Examples: .. code-block:: python - + learning_rate *= local_gw_ratio * sqrt(sumsq(param)) / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) """ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 503c91c27bafa764bec5cc3d22153e91ae3787d9..0e4b5aadc0b0d7e87ea1cfb8e18339fe211e1eef 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -22,7 +22,7 @@ import six import os import inspect from ..layer_helper import LayerHelper -from ..initializer import Normal, Constant +from ..initializer import Normal, Constant, NumpyArrayInitializer from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -179,6 +179,7 @@ __all__ = [ 'merge_selected_rows', 'get_tensor_from_selected_rows', 'lstm', + 'shuffle_channel', 'py_func', 'psroi_pool', 'teacher_student_sigmoid_loss', @@ -931,7 +932,7 @@ def dynamic_gru(input, create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|bool|None): The parameter attribute for the bias - of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates the bias in the update gate, reset gate and candidate calculations. If it is set to False, no bias will be applied to the update gate, reset gate and candidate calculations. 
If it is set to None or one @@ -1072,7 +1073,7 @@ def gru_unit(input, create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|bool|None): The parameter attribute for the bias - of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates the bias in the update gate, reset gate and candidate calculations. If it is set to False, no bias will be applied to the update gate, reset gate and candidate calculations. If it is set to None or one @@ -2874,7 +2875,7 @@ def batch_norm(input, attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) # setting stop_gradient=True to reduce computation if use_global_stats and helper.bias_attr.learning_rate == 0.: - scale.stop_gradient = True + bias.stop_gradient = True mean = helper.create_parameter( attr=ParamAttr( @@ -3875,7 +3876,9 @@ def beam_search(pre_ids, beam_size, end_id, level=0, - name=None): + is_accumulated=True, + name=None, + return_parent_idx=False): """ Beam search is a classical algorithm for selecting candidate words in a machine translation task. @@ -3887,14 +3890,17 @@ def beam_search(pre_ids, selects the top-K candidate word ids of current step from :attr:`ids` according to their :attr:`scores` for all source sentences, where K is :attr:`beam_size` and :attr:`ids, scores` are predicted results from the - computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are - the output of beam_search at previous step, they are needed for special use - to handle ended candidate translations. - - Note that the :attr:`scores` passed in should be accumulated scores, and - length penalty should be done with extra operators before calculating the - accumulated scores if needed, also suggest finding top-K before it and - using the top-K candidates following. + computation cell. If :attr:`ids` is not set, it will be calculated out + according to :attr:`scores`. Additionally, :attr:`pre_ids` and + :attr:`pre_scores` are the output of beam_search at previous step, they + are needed for special use to handle ended candidate translations. + + Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores` + passed in should be accumulated scores. Else, the :attr:`scores` are + considered as the straightforward scores and will be transformed to the + log field and accumulated the :attr:`pre_scores` in this operator. + Length penalty should be done with extra operators before calculating the + accumulated scores if needed. Please see the following demo for a fully beam search usage example: @@ -3924,12 +3930,20 @@ def beam_search(pre_ids, describes how these candidates belong to the prefix. The paths linking prefixes and selected candidates are organized and reserved in lod. + is_accumulated(bool, default True): Whether the input :attr:`score` is + accumulated scores. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. + return_parent_idx(bool): Whether to return an extra Tensor variable + preserving the selected_ids' parent indice in pre_ids + in output, which can be used to gather cell states at + the next time step. Returns: - Variable: The LodTensor pair containing the selected ids and the \ - corresponding scores. + Variable: The LodTensor tuple containing the selected ids and the \ + corresponding scores. 
If :attr:`return_parent_idx` is :attr:`True`, \ + an extra Tensor variable preserving the selected_ids' parent indice \ + is included. Examples: .. code-block:: python @@ -3952,33 +3966,41 @@ def beam_search(pre_ids, end_id=end_id) """ helper = LayerHelper('beam_search', **locals()) - score_type = scores.dtype - id_type = ids.dtype + score_type = pre_scores.dtype + id_type = pre_ids.dtype + + inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores} + if ids is not None: + inputs["ids"] = ids selected_scores = helper.create_variable_for_type_inference( dtype=score_type) selected_ids = helper.create_variable_for_type_inference(dtype=id_type) + # parent_idx is a tensor used to gather cell states at the next time + # step. Though lod in selected_ids can also be used to gather by + # sequence_expand, it is not efficient. + # gather_op's index input only supports int32 dtype currently + parent_idx = helper.create_variable_for_type_inference(dtype="int32") helper.append_op( type='beam_search', - inputs={ - 'pre_ids': pre_ids, - 'pre_scores': pre_scores, - 'ids': ids, - 'scores': scores, - }, + inputs=inputs, outputs={ 'selected_ids': selected_ids, 'selected_scores': selected_scores, + 'parent_idx': parent_idx }, attrs={ # TODO(ChunweiYan) to assure other value support 'level': level, 'beam_size': beam_size, 'end_id': end_id, + 'is_accumulated': is_accumulated, }) - - return selected_ids, selected_scores + if return_parent_idx: + return selected_ids, selected_scores, parent_idx + else: + return selected_ids, selected_scores def beam_search_decode(ids, scores, beam_size, end_id, name=None): @@ -5146,9 +5168,9 @@ def nce(input, littles = [] for i in range(custom_dist_len): normal_prob = custom_dist[i] * custom_dist_len - if normal_prob - 1.0 > 1e-4: + if normal_prob - 1.0 > 0: bigs.append((i, normal_prob)) - elif 1.0 - normal_prob > 1e-4: + elif 1.0 - normal_prob > 0: littles.append((i, normal_prob)) else: alias_probs_[i] = normal_prob @@ -5164,9 +5186,9 @@ def nce(input, alias_probs_[little[0]] = little[1] alias_[little[0]] = big_idx big_left = big[1] + little[1] - 1 - if big_left - 1.0 > 1e-4: + if big_left - 1.0 > 0: bigs.append((big_idx, big_left)) - elif 1.0 - big_left > 1e-4: + elif 1.0 - big_left > 0: littles.append((big_idx, big_left)) else: alias_probs_[big_idx] = big_left @@ -5181,14 +5203,21 @@ def nce(input, alias_probs_[little[0]] = 1.0 alias_[little[0]] = -1 - probs = assign(input=np.array(custom_dist).astype('float32')) - custom_alias = assign(input=np.array(alias_).astype('int32')) - custom_alias_probs = assign( - input=np.array(alias_probs_).astype('float32')) - - inputs['CustomDistProbs'] = probs - inputs['CustomDistAlias'] = custom_alias - inputs['CustomDistAliasProbs'] = custom_alias_probs + def _init_by_numpy_array(numpy_array): + ret = helper.create_parameter( + attr=ParamAttr(), + shape=numpy_array.shape, + dtype=numpy_array.dtype, + default_initializer=NumpyArrayInitializer(numpy_array)) + ret.stop_gradient = True + return ret + + inputs['CustomDistProbs'] = _init_by_numpy_array( + np.array(custom_dist).astype('float32')) + inputs['CustomDistAlias'] = _init_by_numpy_array( + np.array(alias_).astype('int32')) + inputs['CustomDistAliasProbs'] = _init_by_numpy_array( + np.array(alias_probs_).astype('float32')) sampler = 2 else: raise Exception("Unsupported sampler type.") @@ -5389,7 +5418,7 @@ def transpose(x, perm, name=None): Examples: .. 
code-block:: python - # use append_batch_size=False to avoid prepending extra + # use append_batch_size=False to avoid prepending extra # batch size in shape x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32', append_batch_size=False) @@ -5849,7 +5878,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, - attrs={'step': float(step)}) + attrs={'step': float(step)}, + stop_gradient=True) counter.stop_gradient = True return counter @@ -5905,7 +5935,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): than :attr:`shape`. act (str): The non-linear activation to be applied to the reshaped tensor variable. - inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple + inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple operators. If this flag is set :attr:`True`, reuse input :attr:`x` to reshape, which will change the shape of tensor variable :attr:`x` and might cause errors when @@ -6566,7 +6596,9 @@ def image_resize(input, scale=None, name=None, resample='BILINEAR', - actual_shape=None): + actual_shape=None, + align_corners=True, + align_mode=1): """ **Resize a Batch of Images** @@ -6579,6 +6611,80 @@ def image_resize(input, 'NEAREST' : Nearest neighbor interpolation + Nearest neighbor interpolation is to perform nearest neighbor interpolation + in both the 3rd dimention(in height direction) and the 4th dimention(in width + direction) on input tensor. + + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then + again in the other direction. + + Align_corners and align_mode are optinal parameters,the calculation method + of interpolation can be selected by them. + + Example: + + For scale: + + if align_corners = True && out_size > 1 : + + scale_factor = (in_size-1.0)/(out_size-1.0) + + else: + + scale_factor = float(in_size/out_size) + + + Nearest neighbor interpolation: + + if: + align_corners = False + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor + W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor + + else: + align_corners = True + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + Bilinear interpolation: + + if: + align_corners = False , align_mode = 0 + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + + + else: + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + + For details of nearest neighbor interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. + + For details of bilinear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Bilinear_interpolation. + + + Args: input (Variable): The input tensor of image resize layer, This is a 4-D tensor of the shape @@ -6608,6 +6714,13 @@ def image_resize(input, set, otherwise errors would be occured in graph constructing stage. 
Default: None + align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the + input and output tensors are aligned, preserving the values at the + corner pixels. + Default: True + align_mode(int) : An optional for bilinear interpolation. can be \'0\' + for src_idx = scale*(dst_indx+0.5)-0.5 , can be \'1\' for + src_idx = scale*dst_index . Returns: Variable: The output is a 4-D tensor of the shape @@ -6620,6 +6733,8 @@ def image_resize(input, or 'NEAREST' currently. ValueError: One of out_shape and scale must not be None. ValueError: out_shape length should be 2. + TypeError: align_corners shoule be a bool value + ValueError: align_mode can only be '0' or '1' Examples: .. code-block:: python @@ -6635,6 +6750,12 @@ def image_resize(input, "The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently." ) resample_type = resample_methods[resample] + + if not isinstance(align_corners, bool): + raise TypeError("Attr align_corners should be a bool value") + if align_mode != 0 and align_mode != 1: + raise ValueError("align_mode can only be 0 or 1") + if out_shape is None and scale is None: raise ValueError("One of out_shape and scale must not be None.") helper = LayerHelper('{}_interp'.format(resample_type), **locals()) @@ -6674,9 +6795,13 @@ def image_resize(input, type='{}_interp'.format(resample_type), inputs=inputs, outputs={"Out": out}, - attrs={"out_h": out_h, - "out_w": out_w, - "interp_method": resample_type}) + attrs={ + "out_h": out_h, + "out_w": out_w, + "interp_method": resample_type, + "align_corners": align_corners, + "align_mode": align_mode + }) return out @@ -6685,7 +6810,9 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None, - actual_shape=None): + actual_shape=None, + align_corners=True, + align_mode=1): """ Resize input by performing bilinear interpolation based on given output shape which specified by actual_shape, out_shape and scale @@ -6700,6 +6827,47 @@ def resize_bilinear(input, For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation + Align_corners and align_mode are optinal parameters,the calculation + method of interpolation can be selected by them. + + + Align_corners and align_mode are optinal parameters,the calculation method + of interpolation can be selected by them. + + Example: + + For scale: + + if align_corners = True && out_size > 1 : + + scale_factor = (in_size-1.0)/(out_size-1.0) + + else: + + scale_factor = float(in_size/out_size) + + Bilinear interpolation: + + if: + align_corners = False , align_mode = 0 + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + + + else: + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + + + Args: input(${x_type}): ${x_comment}. @@ -6723,6 +6891,8 @@ def resize_bilinear(input, set, otherwise errors would be occured in graph constructing stage. Default: None + align_corners(bool): ${align_corners_comment} + align_mode(bool): ${align_mode_comment} Returns: ${out_comment}. 
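A hedged usage sketch of the extended resize interface (layer name and shapes are illustrative):

.. code-block:: python

    import paddle.fluid as fluid

    img = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')
    # Upsample to 64x64; align_corners=False with align_mode=0 selects the
    # half-pixel mapping src_idx = scale * (dst_idx + 0.5) - 0.5 described above.
    out = fluid.layers.image_resize(
        input=img,
        out_shape=[64, 64],
        resample='BILINEAR',
        align_corners=False,
        align_mode=0)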
@@ -6733,7 +6903,8 @@ def resize_bilinear(input, out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) """ - return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape) + return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape, + align_corners, align_mode) @templatedoc(op_type="nearest_interp") @@ -6741,13 +6912,48 @@ def resize_nearest(input, out_shape=None, scale=None, name=None, - actual_shape=None): + actual_shape=None, + align_corners=True): """ Resize input by performing nearest neighbor interpolation in both the 3rd dimention(in height direction) and the 4th dimention(in width direction) based on given output shape which specified by actual_shape, out_shape and scale in priority order. + Example: + + For scale: + + if align_corners = True && out_size > 1 : + + scale_factor = (in_size-1.0)/(out_size-1.0) + + else: + + scale_factor = float(in_size/out_size) + + + Nearest neighbor interpolation: + + if: + align_corners = False + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor + W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor + + else: + align_corners = True + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation @@ -6774,6 +6980,7 @@ def resize_nearest(input, set, otherwise errors would be occured in graph constructing stage. Default: None + align_corners(bool): ${align_corners_comment} Returns: ${out_comment}. @@ -6784,7 +6991,8 @@ def resize_nearest(input, out = fluid.layers.resize_nearest(input, out_shape=[12, 12]) """ - return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape) + return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape, + align_corners) def image_resize_short(input, out_short_len, resample='BILINEAR'): @@ -8927,7 +9135,8 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): def sigmoid_cross_entropy_with_logits(x, label, ignore_index=kIgnoreIndex, - name=None): + name=None, + normalize=False): """ ${comment} @@ -8936,9 +9145,25 @@ def sigmoid_cross_entropy_with_logits(x, label(${label_type}): ${label_comment} ignore_index(&{ignore_index}): ${ignore_index_comment} name(basestring|None): Name of the output. + normalize(bool): If true, divide the output by the number of + targets != ignore_index. Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + input = fluid.layers.data( + name='data', shape=[10], dtype='float32') + label = fluid.layers.data( + name='data', shape=[10], dtype='float32') + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=input, + label=label, + ignore_index=-1, + normalize=True) # or False + # loss = fluid.layers.reduce_sum(loss) # summation of loss """ helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals()) @@ -8953,7 +9178,8 @@ def sigmoid_cross_entropy_with_logits(x, type="sigmoid_cross_entropy_with_logits", inputs={"X": x, "Label": label}, - attrs={"ignore_index": ignore_index}, + attrs={"ignore_index": ignore_index, + 'normalize': normalize}, outputs={"Out": out}) return out @@ -9450,7 +9676,7 @@ def teacher_student_sigmoid_loss(input, by the previous operator. label (Variable|list): the ground truth which is a 2-D tensor with shape [N x 1], where N is the batch size. 
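The effect of the new ``normalize`` flag can be stated as a small NumPy reference. This sketches the documented semantics under the assumption that normalization divides by the count of non-ignored targets; it is not the registered kernel:

.. code-block:: python

    import numpy as np

    def sigmoid_ce_with_logits_np(x, label, ignore_index=-1, normalize=False):
        # numerically stable elementwise loss
        loss = np.maximum(x, 0) - x * label + np.log1p(np.exp(-np.abs(x)))
        loss[label == ignore_index] = 0.0
        if normalize:
            # divide by the number of targets that are not ignored
            loss = loss / max(np.sum(label != ignore_index), 1)
        return loss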
- soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound Returns: @@ -9614,6 +9840,79 @@ def get_tensor_from_selected_rows(x, name=None): return out +def shuffle_channel(x, group, name=None): + """ + **Shuffle Channel Operator** + + This operator shuffles the channels of input x. + It divide the input channels in each group into :attr:`group` subgroups, + and obtain a new order by selecting element from every subgroup one by one. + + Please refer to the paper + https://arxiv.org/pdf/1707.01083.pdf + + .. code-block:: text + + Given a 4-D tensor input with the shape (N, C, H, W): + input.shape = (1, 4, 2, 2) + input.data =[[[[0.1, 0.2], + [0.2, 0.3]], + + [[0.3, 0.4], + [0.4, 0.5]], + + [[0.5, 0.6], + [0.6, 0.7]], + + [[0.7, 0.8], + [0.8, 0.9]]]] + Given group: 2 + then we get a 4-D tensor out whth the same shape of input: + out.shape = (1, 4, 2, 2) + out.data = [[[[0.1, 0.2], + [0.2, 0.3]], + + [[0.5, 0.6], + [0.6, 0.7]], + + [[0.3, 0.4], + [0.4, 0.5]], + + [[0.7, 0.8], + [0.8, 0.9]]]] + + Args: + x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W] + group(int): Indicating the conuts of subgroups, It should divide the number of channels. + + Returns: + out(Variable): the channels shuffling result is a tensor variable with the + same shape and same type as the input. + + Raises: + ValueError: If group is not an int type variable. + + Examples: + .. code-block:: python + + input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32') + out = fluid.layers.shuffle_channel(x=input, group=2) + """ + helper = LayerHelper("shuffle_channel", **locals()) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if not isinstance(group, int): + raise TypeError("group must be int type") + + helper.append_op( + type="shuffle_channel", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"group": group}) + return out + + class PyFuncRegistry(object): _register_funcs = [] diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 6c18af7283e19bd431c8d543255d900dc89cba09..3dcf9dc06998be9c38a48f18075cbf99f3dccb1a 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -135,7 +135,7 @@ def thresholded_relu(x, threshold=None): if val is not None: kwargs[name] = val - _thresholded_relu_(**kwargs) + return _thresholded_relu_(**kwargs) thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """ diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ce9f508c9f10981d62b7f8417080f0f3b8d9b8a7..2153ca254f0e286a77160a2d53473e1bc76109d5 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -382,7 +382,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): 'dtype': out.dtype, 'value': float(value), 'force_cpu': force_cpu or force_init_on_cpu() - }) + }, + stop_gradient=True) out.stop_gradient = True return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index b72b900d3b26cb705b76623e9bacc16c76a7c79f..e0e781a322b3eb68e3f54a66252a8d8b11a9a56f 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -301,10 +301,10 @@ class Optimizer(object): no_grad_set (set|None): set of Variables should be ignored. 
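The channel shuffle described above is equivalent to a reshape/transpose/reshape; a small NumPy sketch (not the operator kernel):

.. code-block:: python

    import numpy as np

    def shuffle_channel_np(x, group):
        n, c, h, w = x.shape
        assert c % group == 0
        return (x.reshape(n, group, c // group, h, w)
                 .transpose(0, 2, 1, 3, 4)
                 .reshape(n, c, h, w))

    x = np.arange(16, dtype='float32').reshape(1, 4, 2, 2)
    # with group=2 the channel order becomes [0, 2, 1, 3]
    print(shuffle_channel_np(x, 2)[0, :, 0, 0])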
callbacks (list|None): list of callables to run when appending backward operator for one parameter. - + Return: list: list of (param, grad) pair, grad is the output of backward. - + Examples: See examples in `apply_gradients`. """ @@ -322,10 +322,10 @@ class Optimizer(object): Args: params_grads (list): list of (param, grad) pair to do optimization. - + Returns: list: A list of operators appended to the current program. - + Examples: .. code-block:: python @@ -364,7 +364,7 @@ class Optimizer(object): This method combines interface `backward()` and `apply_gradients()` into one. - + Args: loss (Variable): loss variable to run optimizations. startup_program (Program): startup_program for initializing parameters @@ -381,18 +381,21 @@ class Optimizer(object): optimize_ops = [] if imperative_base.enabled(): if parameter_list is not None: - params_grads = parameter_list + parameters = parameter_list else: parameters = program.global_block().all_parameters() - params_grads = [] - for param in parameters: - # create gradient variable - grad_var = Variable( - block=loss.block, - name=param._ivar._grad_name(), - stop_gradient=True, - ivar=param._ivar._grad_ivar()) - params_grads.append((param, grad_var)) + + params_grads = [] + for param in parameters: + if param.stop_gradient or not param.trainable: + continue + # create gradient variable + grad_var = Variable( + block=loss.block, + name=param._ivar._grad_name(), + stop_gradient=True, + ivar=param._ivar._grad_ivar()) + params_grads.append((param, grad_var)) with program_guard(program, startup_program): optimize_ops = self._create_optimization_pass(params_grads) else: diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a1b1d2f584c399b790580757dea746d7b4e4ac80..a07ff6ac69ca20c8c68659a67606076ce8cdf027 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -159,7 +159,7 @@ class ParallelExecutor(object): trainers_endpoints = main._trainers_endpoints if num_trainers > 1 and trainers_endpoints: assert num_trainers == len( - trainers_endpoints), "num_trainers == len(end_points)" + trainers_endpoints), "num_trainers == len(endpoints)" build_strategy.trainers_endpoints = trainers_endpoints # step6: get persistable_vars, places. 
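For reference, a minimal graph-mode sketch of the backward()/apply_gradients() pair whose docstrings are touched here; in imperative mode the patch additionally skips parameters whose stop_gradient is set or whose trainable flag is unset when building the (param, grad) list. Network shapes below are illustrative:

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    sgd = fluid.optimizer.SGD(learning_rate=0.01)
    # minimize() is backward() followed by apply_gradients()
    params_grads = sgd.backward(loss)
    update_ops = sgd.apply_gradients(params_grads)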
persistable_vars diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index d99eaa0634f93dcd16dd80ae172f11e8090a2623..77dfa1cb519db3faa9ef8b7b27f7a39b5d31f2a8 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -50,6 +50,19 @@ class TestDetection(unittest.TestCase): self.assertEqual(out.shape[-1], 6) print(str(program)) + def test_box_coder_api(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[4], dtype='float32') + y = layers.data(name='z', shape=[4], dtype='float32', lod_level=1) + bcoder = layers.box_coder( + prior_box=x, + prior_box_var=[0.1, 0.2, 0.1, 0.2], + target_box=y, + code_type='encode_center_size') + self.assertIsNotNone(bcoder) + print(str(program)) + def test_detection_api(self): program = Program() with program_guard(program): @@ -203,7 +216,7 @@ class TestGenerateProposalLabels(unittest.TestCase): lod_level=1, append_batch_size=False) class_nums = 5 - rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( + outs = fluid.layers.generate_proposal_labels( rpn_rois=rpn_rois, gt_classes=gt_classes, is_crowd=is_crowd, @@ -216,6 +229,11 @@ class TestGenerateProposalLabels(unittest.TestCase): bg_thresh_lo=0.0, bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], class_nums=class_nums) + rois = outs[0] + labels_int32 = outs[1] + bbox_targets = outs[2] + bbox_inside_weights = outs[3] + bbox_outside_weights = outs[4] assert rois.shape[1] == 4 assert rois.shape[0] == labels_int32.shape[0] assert rois.shape[0] == bbox_targets.shape[0] @@ -226,6 +244,62 @@ class TestGenerateProposalLabels(unittest.TestCase): assert bbox_outside_weights.shape[1] == 4 * class_nums +class TestGenerateMaskLabels(unittest.TestCase): + def test_generate_mask_labels(self): + program = Program() + with program_guard(program): + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) + gt_classes = layers.data( + name='gt_classes', + shape=[2, 1], + dtype='int32', + lod_level=1, + append_batch_size=False) + is_crowd = layers.data( + name='is_crowd', + shape=[2, 1], + dtype='int32', + lod_level=1, + append_batch_size=False) + gt_segms = layers.data( + name='gt_segms', + shape=[20, 2], + dtype='float32', + lod_level=3, + append_batch_size=False) + rois = layers.data( + name='rois', + shape=[4, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + labels_int32 = layers.data( + name='labels_int32', + shape=[4, 1], + dtype='int32', + lod_level=1, + append_batch_size=False) + num_classes = 5 + resolution = 14 + outs = fluid.layers.generate_mask_labels( + im_info=im_info, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_segms=gt_segms, + rois=rois, + labels_int32=labels_int32, + num_classes=num_classes, + resolution=resolution) + mask_rois, roi_has_mask_int32, mask_int32 = outs + assert mask_rois.shape[1] == 4 + assert mask_int32.shape[1] == num_classes * resolution * resolution + + class TestMultiBoxHead(unittest.TestCase): def test_multi_box_head(self): data_shape = [3, 224, 224] @@ -313,7 +387,7 @@ class TestRpnTargetAssign(unittest.TestCase): name='gt_boxes', shape=[4], lod_level=1, dtype='float32') is_crowd = layers.data( name='is_crowd', - shape=[10], + shape=[1, 10], dtype='int32', lod_level=1, append_batch_size=False) @@ -323,7 +397,7 @@ class TestRpnTargetAssign(unittest.TestCase): dtype='float32', lod_level=1, append_batch_size=False) - 
pred_scores, pred_loc, tgt_lbl, tgt_bbox, bbox_inside_weight = layers.rpn_target_assign( + outs = layers.rpn_target_assign( bbox_pred=bbox_pred, cls_logits=cls_logits, anchor_box=anchor_box, @@ -337,6 +411,11 @@ class TestRpnTargetAssign(unittest.TestCase): rpn_positive_overlap=0.7, rpn_negative_overlap=0.3, use_random=False) + pred_scores = outs[0] + pred_loc = outs[1] + tgt_lbl = outs[2] + tgt_bbox = outs[3] + bbox_inside_weight = outs[4] self.assertIsNotNone(pred_scores) self.assertIsNotNone(pred_loc) @@ -351,41 +430,43 @@ class TestRpnTargetAssign(unittest.TestCase): class TestGenerateProposals(unittest.TestCase): def test_generate_proposals(self): - data_shape = [20, 64, 64] - images = fluid.layers.data( - name='images', shape=data_shape, dtype='float32') - im_info = fluid.layers.data( - name='im_info', shape=[1, 3], dtype='float32') - anchors, variances = fluid.layers.anchor_generator( - name='anchor_generator', - input=images, - anchor_sizes=[32, 64], - aspect_ratios=[1.0], - variance=[0.1, 0.1, 0.2, 0.2], - stride=[16.0, 16.0], - offset=0.5) - num_anchors = anchors.shape[2] - scores = fluid.layers.data( - name='scores', shape=[1, num_anchors, 8, 8], dtype='float32') - bbox_deltas = fluid.layers.data( - name='bbox_deltas', - shape=[1, num_anchors * 4, 8, 8], - dtype='float32') - rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( - name='generate_proposals', - scores=scores, - bbox_deltas=bbox_deltas, - im_info=im_info, - anchors=anchors, - variances=variances, - pre_nms_top_n=6000, - post_nms_top_n=1000, - nms_thresh=0.5, - min_size=0.1, - eta=1.0) - self.assertIsNotNone(rpn_rois) - self.assertIsNotNone(rpn_roi_probs) - print(rpn_rois.shape) + program = Program() + with program_guard(program): + data_shape = [20, 64, 64] + images = fluid.layers.data( + name='images', shape=data_shape, dtype='float32') + im_info = fluid.layers.data( + name='im_info', shape=[3], dtype='float32') + anchors, variances = fluid.layers.anchor_generator( + name='anchor_generator', + input=images, + anchor_sizes=[32, 64], + aspect_ratios=[1.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + num_anchors = anchors.shape[2] + scores = fluid.layers.data( + name='scores', shape=[num_anchors, 8, 8], dtype='float32') + bbox_deltas = fluid.layers.data( + name='bbox_deltas', + shape=[num_anchors * 4, 8, 8], + dtype='float32') + rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( + name='generate_proposals', + scores=scores, + bbox_deltas=bbox_deltas, + im_info=im_info, + anchors=anchors, + variances=variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0) + self.assertIsNotNone(rpn_rois) + self.assertIsNotNone(rpn_roi_probs) + print(rpn_rois.shape) class TestYoloDetection(unittest.TestCase): @@ -395,11 +476,22 @@ class TestYoloDetection(unittest.TestCase): x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') - loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10, - 0.5) + loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], + [0, 1], 10, 0.7, 32) self.assertIsNotNone(loss) +class TestMulticlassNMS(unittest.TestCase): + def test_multiclass_nms(self): + program = Program() + with program_guard(program): + bboxes = layers.data( + name='bboxes', shape=[-1, 10, 4], dtype='float32') + scores = layers.data(name='scores', shape=[-1, 10], dtype='float32') + output = 
layers.multiclass_nms(bboxes, scores, 0.3, 400, 200, 0.7) + self.assertIsNotNone(output) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 808e1e6aa80744db1289094d7c1bad00002a4c3e..699181d01da862dca72113e6c11630ae5693e41c 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1,15 +1,6 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -# The MKLDNN tests are skiped when the MKLDNN flag is OFF -if(NOT WITH_MKLDNN) - foreach(src ${TEST_OPS}) - if(${src} MATCHES ".*_mkldnn_op$") - list(REMOVE_ITEM TEST_OPS ${src}) - endif() - endforeach() -endif(NOT WITH_MKLDNN) - if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) list(REMOVE_ITEM TEST_OPS test_dist_transpiler) @@ -84,6 +75,8 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) +list(REMOVE_ITEM TEST_OPS test_imperative_resnet) +list(REMOVE_ITEM TEST_OPS test_imperative_optimizer) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -91,6 +84,10 @@ py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) +py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS + FLAGS_cudnn_deterministic=1) +py_test_modules(test_imperative_optimizer MODULES test_imperative_optimizer ENVS + FLAGS_cudnn_deterministic=1) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) @@ -117,3 +114,7 @@ endif() if (WITH_NGRAPH) add_subdirectory(ngraph) endif() + +if (WITH_MKLDNN) + add_subdirectory(mkldnn) +endif() diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index faec5350424668fca6416e91c3e58174bd4ec877..f0f13a9d49c5b84521aa3e00bdcabe0c494853a7 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -80,7 +80,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, - args.trainers, args.sync_mode) + args.trainers, args.sync_mode, False, + args.current_endpoint) pserver_prog = t.get_pserver_program(args.current_endpoint) startup_prog = t.get_startup_program(args.current_endpoint, pserver_prog) @@ -93,7 +94,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): exe.run(startup_prog) if need_load and model_dir: - self._load_persistable_vars(exe, model_dir, startup_prog) + fluid.io.load_persistables(exe, model_dir, pserver_prog) + exe.run(pserver_prog) def run_trainer(self, args): @@ -158,19 +160,46 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): need_save = bool(int(os.getenv("SAVE", "0"))) model_dir = os.getenv("MODEL_DIR", "") - - if need_save: - for _ in six.moves.xrange(RUN_STEP): - loss, = 
exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(get_data())) - if need_save and model_dir: - io.save_persistables(startup_exe, model_dir, trainer_prog) - - var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor()) - if six.PY2: - print(pickle.dumps(np.ravel(var).tolist())) + save_mode = os.getenv("SAVE_MODE", "") + + if save_mode == "LOCAL": + if need_save: + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir: + io.save_persistables(startup_exe, model_dir, trainer_prog) + + var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor( + )) + if six.PY2: + print(pickle.dumps(np.ravel(var).tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + + elif save_mode == "DIST": + skip_steps = int(os.getenv("SKIP_STEPS")) + loss = None + if need_save: + for idx in six.moves.xrange(8): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir and idx == skip_steps and args.trainer_id == 0: + io.save_persistables(startup_exe, model_dir, + trainer_prog) + else: + for idx in six.moves.xrange(8): + data = get_data() + if idx <= skip_steps: + continue + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(data)) + if six.PY2: + print(pickle.dumps(loss.tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(loss.tolist())) else: - sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + raise Exception("save_mode must be LOCAL or DIST") if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py index fac5e037a46715d146e354825f09ee8ccc4f3d70..09afae6114e2b6cc8bce9b2be3b221ba9825db8c 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -75,9 +75,13 @@ def get_loss(cos_q_pt, cos_q_nt): return avg_cost -def get_optimizer(): - # SGD optimizer - optimizer = fluid.optimizer.SGD(learning_rate=base_lr) +def get_optimizer(op="sgd"): + if op.upper() == "sgd".upper(): + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) + elif op.upper() == "adam".upper(): + optimizer = fluid.optimizer.Adam(learning_rate=base_lr) + else: + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) return optimizer @@ -237,7 +241,8 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = get_optimizer() + opt = os.getenv('OPTIMIZER', 'sgd') + opt = get_optimizer(opt) opt.minimize(avg_cost) # Reader diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f71e04c09aa38b8cf7b3a167b84d4dc0e6cc3ec7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/__init__.py b/python/paddle/fluid/tests/unittests/mkldnn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b94a21a7e406b833797f8f521c62a2351c2bc30a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/__init__.py @@ -0,0 +1,13 @@ +# Copyright 
(c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py similarity index 94% rename from python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index 611d0dd076b827b0f528f2e3a31182cc4939d1f1..ad94a4b21c347c9a2782437948c20d3b3071c679 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -17,9 +17,9 @@ from __future__ import print_function import unittest import numpy as np import paddle.fluid.core as core -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest from scipy.special import expit -from test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs +from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs class TestMKLDNNReluDim2(TestRelu): diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py similarity index 92% rename from python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py index 1286cee8dc1855c1b1695da46ae0b5222c065114..5fce90372d9beda9b04ab68d0a8ac5ef5c124421 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py @@ -19,9 +19,9 @@ import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator import paddle.fluid as fluid -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest from paddle.fluid.framework import grad_var_name -from test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad +from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining): diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py similarity index 94% rename from python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py index 0f2130f9049c7ee294444282e59c654551f76603..1a399740692eab8ccea0c984a1a4f2ac984eb045 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -from test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3 +from paddle.fluid.tests.unittests.test_concat_op 
import TestConcatOp, TestConcatOp2, TestConcatOp3 class TestMKLDNNConcatOp(TestConcatOp): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py similarity index 98% rename from python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index 5ad376cb08e488e85be6369a91d4e81031e9e9db..100a03cea0f740a615c4a08810d4ad9e8c974d7a 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -18,8 +18,8 @@ import unittest import numpy as np import paddle.fluid.core as core -from op_test import OpTest -from test_conv2d_op import conv2d_forward_naive, TestConv2dOp +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp def conv2d_forward_refer(input, filter, group, conv_param): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py similarity index 91% rename from python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 438d45b84033b697c3210acc44392b93bf436df0..0542eef80070cbf281ee013c28b7092a2dd17eaa 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1 +from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1 class TestMKLDNN(TestConv2dOp): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py similarity index 94% rename from python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py index deefdd09abe6b9f9ca362654f21850f598337245..9bcdb7b2a975b648471714ab628caf91b6b6f3a9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride +from paddle.fluid.tests.unittests.test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride class TestMKLDNN(TestConv2dTransposeOp): diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py similarity index 91% rename from python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py index f0e1265e142b800587599783367eca2203033bf1..080b74502fbe83e97e88a65866e0d9b66b37033e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest 
-from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1 +from paddle.fluid.tests.unittests.test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1 class TestMKLDNN(TestConv3dOp): diff --git a/python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py similarity index 97% rename from python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py index 0c5e1abd7c8fb010357998c0ceaebaf21619fda9..9a54f927cbde648bbbb06d043bbc1391ee43c314 100644 --- a/python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest class TestDeQuantizeOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py similarity index 97% rename from python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py index d85cc1f856df8eaa73cef318b48a292042488edf..c3a42656b71d09dbc22abf8ce2ddc243b43b422f 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py @@ -16,8 +16,8 @@ from __future__ import print_function import unittest import numpy as np import paddle.fluid.core as core -from op_test import OpTest -from test_elementwise_add_op import * +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_elementwise_add_op import * ''' Some tests differ from the tests defined in test_elementwise_add_op.py because MKLDNN does not support tensors of number of dimensions 3. 
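With the tests relocated under ``mkldnn/``, shared helpers are imported by their absolute package path rather than relative to the old unittests directory; a placeholder skeleton for any further test added there (module and class names are hypothetical):

.. code-block:: python

    # python/paddle/fluid/tests/unittests/mkldnn/test_example_mkldnn_op.py
    from __future__ import print_function

    import unittest

    # absolute import instead of the old `from test_conv2d_op import ...`
    from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp


    class TestExampleMKLDNNOp(TestConv2dOp):
        def init_kernel_type(self):
            self.use_mkldnn = True


    if __name__ == '__main__':
        unittest.main()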
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py similarity index 98% rename from python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py index 536e9a1c58ec4a8b1b5a7c1d3a5fe737b38d24ab..738715dd70181988028adff1c50be3a52199c312 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py @@ -15,10 +15,10 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest import paddle.fluid.core as core from paddle.fluid.op import Operator -from test_elementwise_mul_op import * +from paddle.fluid.tests.unittests.test_elementwise_mul_op import * class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp): diff --git a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py similarity index 98% rename from python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py index 45951a34d6f61a242cb2dc004d6801a6c1c9dd92..84229a5cffbb466ef3c69cd997adacfb21f6aae2 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest def fully_connected_naive(input, weights, bias_data=None): diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py similarity index 90% rename from python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py index 9777ec390656d3f6166bf9f5de7bbad8b6bd786d..c18bd77bd3e6de08283f3ac3a31c73453f3c9129 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_gaussian_random_op import TestGaussianRandomOp +from paddle.fluid.tests.unittests.test_gaussian_random_op import TestGaussianRandomOp class TestMKLDNN(TestGaussianRandomOp): diff --git a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py similarity index 96% rename from python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py index f6bb2ab7a696c40cb61dd5b38ca702b577fe7ea2..a5e6e116a5f1bc1e051ce3cfdac8cd1e5f3ed90e 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -from test_lrn_op import TestLRNOp +from paddle.fluid.tests.unittests.test_lrn_op import TestLRNOp class TestLRNMKLDNNOp(TestLRNOp): diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py 
similarity index 94% rename from python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py index f4495d0bc8198189962d033ec18b8b67f1f47c84..fca906fecc5fe8d25b9251c886398f8df778043f 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py @@ -19,8 +19,8 @@ import unittest import numpy as np import paddle.fluid.core as core -from op_test import OpTest -from test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive class TestPool2dMKLDNNInt8_Op(TestPool2D_Op): diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py similarity index 90% rename from python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py index 7de5fefc148021d4109da2ac9f4b36c93a05a23f..6de43dd46e5d184ec934f2d85e0c87137e9702e0 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -from test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 +from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 def create_test_mkldnn_class(parent): diff --git a/python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py similarity index 97% rename from python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py index 99607928648be437b7f944f86a0c28b99d1775c4..132f7bd039f7797fb0fc332d6f7b8c242af46535 100644 --- a/python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest class TestQuantizeOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py similarity index 92% rename from python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py index 55820f31b81df9f3618d1004f6d21565564efa29..5928047b5171bcf33b024040ce79577b8aa0b53a 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_sum_op import TestSumOp +from paddle.fluid.tests.unittests.test_sum_op import TestSumOp class TestMKLDNN(TestSumOp): diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py similarity index 95% rename from python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py rename to 
python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py index 0c201b9e4f48df94924a248d820ae2cf73367560..4845eefe367f1ad6a2eb6ffd1f9b0598b1b4fbbd 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_transpose_op import TestTransposeOp +from paddle.fluid.tests.unittests.test_transpose_op import TestTransposeOp class TestTransposeMKLDNN(TestTransposeOp): diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..95e592e8ec036ad231ed57ddbc706683cb7aa153 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from paddle.fluid.tests.unittests.test_pool2d_op import * + + +class TestNGRAPHPool2D_Op(TestPool2D_Op): + def init_test_case(self): + super(TestNGRAPHPool2D_Op, self).init_test_case() + + +class TestNGRAPHCase1(TestCase1): + def init_test_case(self): + super(TestNGRAPHCase1, self).init_test_case() + + +class TestNGRAPHCase2(TestCase2): + def init_test_case(self): + super(TestNGRAPHCase2, self).init_test_case() + + +class TestNGRAPHCase3(TestCase3): + def init_pool_type(self): + super(TestNGRAPHCase3, self).init_pool_type() + + +class TestNGRAPHCase4(TestCase4): + def init_pool_type(self): + super(TestNGRAPHCase4, self).init_pool_type() + + +class TestNGRAPHCase5(TestCase5): + def init_pool_type(self): + super(TestNGRAPHCase5, self).init_pool_type() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index c28dda4b53ce5d394ff11222e5df8d257b4e80da..1d9f4b78f30fefa21c189036c3731e0afe39ea9e 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -38,6 +38,7 @@ class BeamSearchOpTester(unittest.TestCase): self._create_pre_ids() self.scope.var('selected_ids') self.scope.var('selected_scores') + self.scope.var('parent_idx') def test_run(self): op = Operator( @@ -48,12 +49,14 @@ class BeamSearchOpTester(unittest.TestCase): scores='scores', selected_ids='selected_ids', selected_scores='selected_scores', + parent_idx='parent_idx', level=0, beam_size=2, end_id=0, ) op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() selected_scores = self.scope.find_var("selected_scores").get_tensor() + parent_idx = self.scope.find_var("parent_idx").get_tensor() self.assertTrue( np.allclose( np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis])) @@ -62,6 +65,8 @@ class 
BeamSearchOpTester(unittest.TestCase): np.array(selected_scores), np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]]) + self.assertTrue( + np.allclose(np.array(parent_idx), np.array([0, 1, 2, 3]))) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index c8a7063dc1cd3e5cc7cd3458b51f5e74981aa75c..f60ed1d79ae5778f751d6101fde386ae3a90c0f7 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -20,7 +20,13 @@ from op_test import OpTest import paddle.fluid.core as core -def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None): +def bilinear_interp_np(input, + out_h, + out_w, + out_size=None, + actual_shape=None, + align_corners=True, + align_mode=0): """bilinear interpolation implement in shape [N, C, H, W]""" if out_size is not None: out_h = out_size[0] @@ -29,25 +35,45 @@ def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None): out_h = actual_shape[0] out_w = actual_shape[1] batch_size, channel, in_h, in_w = input.shape + + ratio_h = ratio_w = 0.0 if out_h > 1: - ratio_h = (in_h - 1.0) / (out_h - 1.0) - else: - ratio_h = 0.0 + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 0.0 + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_h, out_w)) + for i in range(out_h): - h = int(ratio_h * i) + if (align_mode == 0 and not align_corners): + h = int(ratio_h * (i + 0.5) - 0.5) + else: + h = int(ratio_h * i) + + h = max(0, h) hid = 1 if h < in_h - 1 else 0 - h1lambda = ratio_h * i - h + if (align_mode == 0 and not align_corners): + h1lambda = ratio_h * (i + 0.5) - 0.5 - h + else: + h1lambda = ratio_h * i - h h2lambda = 1.0 - h1lambda for j in range(out_w): - w = int(ratio_w * j) + if (align_mode == 0 and not align_corners): + w = int(ratio_w * (j + 0.5) - 0.5) + else: + w = int(ratio_w * j) + w = max(0, w) wid = 1 if w < in_w - 1 else 0 - w1lambda = ratio_w * j - w + if (align_mode == 0 and not align_corners): + w1lambda = ratio_w * (j + 0.5) - 0.5 - w + else: + w1lambda = ratio_w * j - w w2lambda = 1.0 - w1lambda out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + @@ -66,7 +92,8 @@ class TestBilinearInterpOp(OpTest): input_np = np.random.random(self.input_shape).astype("float32") output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, - self.out_size, self.actual_shape) + self.out_size, self.actual_shape, + self.align_corners, self.align_mode) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -75,7 +102,9 @@ class TestBilinearInterpOp(OpTest): self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, - 'interp_method': self.interp_method + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode } self.outputs = {'Out': output_np} @@ -91,6 +120,8 @@ class TestBilinearInterpOp(OpTest): self.out_h = 2 self.out_w = 2 self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase1(TestBilinearInterpOp): @@ -99,6 +130,8 @@ class 
TestBilinearInterpCase1(TestBilinearInterpOp): self.input_shape = [4, 1, 7, 8] self.out_h = 1 self.out_w = 1 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase2(TestBilinearInterpOp): @@ -107,6 +140,8 @@ class TestBilinearInterpCase2(TestBilinearInterpOp): self.input_shape = [3, 3, 9, 6] self.out_h = 12 self.out_w = 12 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase3(TestBilinearInterpOp): @@ -115,6 +150,8 @@ class TestBilinearInterpCase3(TestBilinearInterpOp): self.input_shape = [1, 1, 128, 64] self.out_h = 64 self.out_w = 128 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase4(TestBilinearInterpOp): @@ -124,6 +161,8 @@ class TestBilinearInterpCase4(TestBilinearInterpOp): self.out_h = 1 self.out_w = 1 self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase5(TestBilinearInterpOp): @@ -133,6 +172,8 @@ class TestBilinearInterpCase5(TestBilinearInterpOp): self.out_h = 12 self.out_w = 12 self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase6(TestBilinearInterpOp): @@ -142,6 +183,8 @@ class TestBilinearInterpCase6(TestBilinearInterpOp): self.out_h = 64 self.out_w = 128 self.out_size = np.array([65, 129]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpActualShape(TestBilinearInterpOp): @@ -151,6 +194,8 @@ class TestBilinearInterpActualShape(TestBilinearInterpOp): self.out_h = 64 self.out_w = 32 self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpOpUint8(OpTest): @@ -162,14 +207,17 @@ class TestBilinearInterpOpUint8(OpTest): input_np = np.random.randint( low=0, high=256, size=self.input_shape).astype("uint8") output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, - self.out_size, self.actual_shape) + self.out_size, self.actual_shape, + self.align_corners, self.align_mode) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, - 'interp_method': self.interp_method + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode } self.outputs = {'Out': output_np} @@ -181,6 +229,8 @@ class TestBilinearInterpOpUint8(OpTest): self.input_shape = [1, 3, 9, 6] self.out_h = 10 self.out_w = 9 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8): @@ -189,6 +239,8 @@ class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8): self.input_shape = [2, 3, 128, 64] self.out_h = 120 self.out_w = 50 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8): @@ -198,6 +250,26 @@ class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8): self.out_h = 5 self.out_w = 13 self.out_size = np.array([6, 15]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpOtherMethod1(TestBilinearInterpOp): + def set_align_mode(self): + self.align_corners = False + self.align_mode = 1 + + +class TestBilinearInterpWithMethod2(TestBilinearInterpOp): + def set_align_mode(self): + self.align_corners = False + self.align_mode = 0 + + +class TestBilinearInterpWithMethod3(TestBilinearInterpOp): + def set_align_mode(self): + self.align_corners = True + self.align_mode = 0 if __name__ == 
"__main__": diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 2511c5c22e012babdeb71a71d3546456ea2ceaf3..6156268bf25ada310a3d22242ecff4b9cdf1759a 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -21,80 +21,80 @@ import math from op_test import OpTest -def box_coder(target_box, prior_box, prior_box_var, output_box, code_type, - box_normalized): - prior_box_x = ( - (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0]) - prior_box_y = ( - (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0]) - prior_box_width = ( - (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0]) - prior_box_height = ( - (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0]) - prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0], - prior_box_var.shape[1]) - if not box_normalized: - prior_box_height = prior_box_height + 1 - prior_box_width = prior_box_width + 1 - - if (code_type == "EncodeCenterSize"): - target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape( - target_box.shape[0], 1) - target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape( - target_box.shape[0], 1) - target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape( - target_box.shape[0], 1) - target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape( - target_box.shape[0], 1) - if not box_normalized: - target_box_height = target_box_height + 1 - target_box_width = target_box_width + 1 - - output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \ - prior_box_var[:,:,0] - output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \ - prior_box_var[:,:,1] - output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \ - prior_box_var[:,:,2] - output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \ - prior_box_var[:,:,3] - - elif (code_type == "DecodeCenterSize"): - target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \ - prior_box_width + prior_box_x - target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \ - prior_box_height + prior_box_y - target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \ - prior_box_width - target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \ - prior_box_height - - output_box[:, :, 0] = target_box_x - target_box_width / 2 - output_box[:, :, 1] = target_box_y - target_box_height / 2 - output_box[:, :, 2] = target_box_x + target_box_width / 2 - output_box[:, :, 3] = target_box_y + target_box_height / 2 - if not box_normalized: - output_box[:, :, 2] = output_box[:, :, 2] - 1 - output_box[:, :, 3] = output_box[:, :, 3] - 1 - - -def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type, - box_normalized): - n = target_box.shape[0] - m = prior_box.shape[0] +def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + shape = (1, p_box.shape[0]) if axis == 0 else (p_box.shape[0], 1) + + pb_w = pb_w.reshape(shape) + pb_h = pb_h.reshape(shape) + pb_x = pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) + if pb_v.ndim == 1: + tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[1] * t_box[:, 
:, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[3] * t_box[:, :, 3]) * pb_h + else: + tb_x = pb_v[:, :, 0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[:, :, 1] * t_box[:, :, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[:, :, 2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[:, :, 3] * t_box[:, :, 3]) * pb_h + output_box[:, :, 0] = tb_x - tb_w / 2 + output_box[:, :, 1] = tb_y - tb_h / 2 + output_box[:, :, 2] = tb_x + tb_w / 2 - (not norm) + output_box[:, :, 3] = tb_y + tb_h / 2 - (not norm) + + +def box_encoder(t_box, p_box, pb_v, output_box, norm): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + shape = (1, p_box.shape[0]) + + pb_w = pb_w.reshape(shape) + pb_h = pb_h.reshape(shape) + pb_x = pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) + tb_x = ((t_box[:, 2] + t_box[:, 0]) / 2).reshape(t_box.shape[0], 1) + tb_y = ((t_box[:, 3] + t_box[:, 1]) / 2).reshape(t_box.shape[0], 1) + tb_w = (t_box[:, 2] - t_box[:, 0]).reshape(t_box.shape[0], 1) + (not norm) + tb_h = (t_box[:, 3] - t_box[:, 1]).reshape(t_box.shape[0], 1) + (not norm) + if pb_v.ndim == 1: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[3] + else: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[:, :, 0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[:, :, 1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[:, :, 2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[:, :, 3] + + +def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0): + n = t_box.shape[0] + m = p_box.shape[0] + if code_type == "DecodeCenterSize": + m = t_box.shape[1] output_box = np.zeros((n, m, 4), dtype=np.float32) cur_offset = 0 for i in range(len(lod)): if (code_type == "EncodeCenterSize"): - box_coder(target_box[cur_offset:(cur_offset + lod[i]), :], - prior_box, prior_box_var, - output_box[cur_offset:(cur_offset + lod[i]), :, :], - code_type, box_normalized) + box_encoder(t_box[cur_offset:(cur_offset + lod[i]), :], p_box, pb_v, + output_box[cur_offset:(cur_offset + lod[i]), :, :], + norm) elif (code_type == "DecodeCenterSize"): - box_coder(target_box[cur_offset:(cur_offset + lod[i]), :, :], - prior_box, prior_box_var, - output_box[cur_offset:(cur_offset + lod[i]), :, :], - code_type, box_normalized) + box_decoder(t_box, p_box, pb_v, output_box, norm, axis) cur_offset += lod[i] return output_box @@ -106,9 +106,35 @@ class TestBoxCoderOp(OpTest): def setUp(self): self.op_type = "box_coder" lod = [[1, 1, 1, 1, 1]] - prior_box = np.random.random((10, 4)).astype('float32') - prior_box_var = np.random.random((10, 4)).astype('float32') - target_box = np.random.random((5, 10, 4)).astype('float32') + prior_box = np.random.random((81, 4)).astype('float32') + prior_box_var = np.random.random((81, 4)).astype('float32') + target_box = np.random.random((20, 81, 4)).astype('float32') + code_type = "DecodeCenterSize" + box_normalized = False + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type, box_normalized) + self.inputs = { + 'PriorBox': prior_box, + 'PriorBoxVar': prior_box_var, + 'TargetBox': target_box, + } + self.attrs = { + 'code_type': 'decode_center_size', + 
'box_normalized': False + } + self.outputs = {'OutputBox': output_box} + + +class TestBoxCoderOpWithOneRankVar(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[1, 1, 1, 1, 1]] + prior_box = np.random.random((81, 4)).astype('float32') + prior_box_var = np.random.random((4)).astype('float32') + target_box = np.random.random((20, 81, 4)).astype('float32') code_type = "DecodeCenterSize" box_normalized = False output_box = batch_box_coder(prior_box, prior_box_var, target_box, @@ -133,9 +159,9 @@ class TestBoxCoderOpWithoutBoxVar(OpTest): def setUp(self): self.op_type = "box_coder" lod = [[0, 1, 2, 3, 4, 5]] - prior_box = np.random.random((10, 4)).astype('float32') - prior_box_var = np.ones((10, 4)).astype('float32') - target_box = np.random.random((5, 10, 4)).astype('float32') + prior_box = np.random.random((81, 4)).astype('float32') + prior_box_var = np.ones((81, 4)).astype('float32') + target_box = np.random.random((20, 81, 4)).astype('float32') code_type = "DecodeCenterSize" box_normalized = False output_box = batch_box_coder(prior_box, prior_box_var, target_box, @@ -158,10 +184,10 @@ class TestBoxCoderOpWithLoD(OpTest): def setUp(self): self.op_type = "box_coder" - lod = [[4, 8, 8]] - prior_box = np.random.random((10, 4)).astype('float32') - prior_box_var = np.random.random((10, 4)).astype('float32') - target_box = np.random.random((20, 4)).astype('float32') + lod = [[10, 20, 20]] + prior_box = np.random.random((20, 4)).astype('float32') + prior_box_var = np.random.random((20, 4)).astype('float32') + target_box = np.random.random((50, 4)).astype('float32') code_type = "EncodeCenterSize" box_normalized = True output_box = batch_box_coder(prior_box, prior_box_var, target_box, @@ -176,5 +202,63 @@ class TestBoxCoderOpWithLoD(OpTest): self.outputs = {'OutputBox': output_box} +class TestBoxCoderOpWithAxis(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[1, 1, 1, 1, 1]] + prior_box = np.random.random((30, 4)).astype('float32') + prior_box_var = np.random.random((4)).astype('float32') + target_box = np.random.random((30, 81, 4)).astype('float32') + code_type = "DecodeCenterSize" + box_normalized = False + axis = 1 + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type, box_normalized, axis) + + self.inputs = { + 'PriorBox': prior_box, + 'PriorBoxVar': prior_box_var, + 'TargetBox': target_box, + } + self.attrs = { + 'code_type': 'decode_center_size', + 'box_normalized': False, + 'axis': axis + } + self.outputs = {'OutputBox': output_box} + + +class TestBoxCoderOpWithVariance(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[1, 1, 1, 1, 1]] + prior_box = np.random.random((30, 4)).astype('float32') + prior_box_var = np.random.random((4)).astype('float32') + target_box = np.random.random((30, 81, 4)).astype('float32') + code_type = "DecodeCenterSize" + box_normalized = False + axis = 1 + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type, box_normalized, axis) + + self.inputs = { + 'PriorBox': prior_box, + 'TargetBox': target_box, + } + self.attrs = { + 'code_type': 'decode_center_size', + 'box_normalized': False, + 'variance': prior_box_var.astype(np.float).flatten(), + 'axis': axis + } + self.outputs = {'OutputBox': output_box} + + if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 69a38618cde7f100849a30e1cb1a2f4738e55e0d..0968ace62b6a4e258f7763dbf6fbeda07feb4cd5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -43,7 +43,8 @@ class TestDistRunnerBase(object): pserver_endpoints, trainers, sync_mode, - dc_asgd=False): + dc_asgd=False, + current_endpoint=None): # NOTE: import fluid until runtime, or else forking processes will cause error. config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd @@ -53,7 +54,8 @@ class TestDistRunnerBase(object): program=main_program, pservers=pserver_endpoints, trainers=trainers, - sync_mode=sync_mode) + sync_mode=sync_mode, + current_endpoint=current_endpoint) return t def run_pserver(self, args): @@ -122,7 +124,7 @@ class TestDistRunnerBase(object): if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( - len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") + len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index 4588ca7c17ba5db893f080813d299feaa47626a7..e795bc410ee45a18cc0c7c914636f5b03309fad1 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -33,7 +33,6 @@ class TestDistSaveLoadDense2x2(TestDistBase): delta=1e-3, check_error_log=False, need_envs={}): - required_envs = { "PATH": os.getenv("PATH", ""), "PYTHONPATH": os.getenv("PYTHONPATH", ""), @@ -77,7 +76,77 @@ class TestDistSaveLoadDense2x2(TestDistBase): need_envs = { "IS_DISTRIBUTED": '0', "IS_SPARSE": '0', - 'IS_SELF_CONTAINED_LR': '1' + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'LOCAL', + } + self.check_with_place( + "dist_save_load.py", + delta=0, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "http_proxy": "" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + model_dir = tempfile.mkdtemp() + + save_env = {} + save_env["SAVE_MODE"] = "DIST" + save_env["SAVE"] = "1" + save_env["MODEL_DIR"] = model_dir + save_env.update(required_envs) + + tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env, + check_error_log) + + load_env = {} + load_env["LOAD"] = "1" + load_env["MODEL_DIR"] = model_dir + load_env.update(required_envs) + tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env, + check_error_log) + + shutil.rmtree(model_dir) + + train0_1_np = np.array(tr0_var_1) + train1_1_np = np.array(tr1_var_1) + train0_2_np = np.array(tr0_var_2) + train1_2_np = np.array(tr1_var_2) + + self.assertAlmostEqual( + train0_1_np.all(), train0_2_np.all(), delta=delta) + self.assertAlmostEqual( + train1_1_np.all(), train1_2_np.all(), delta=delta) + + def test_dist(self): + need_envs 
= { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'DIST', + 'OPTIMIZER': 'ADAM', + 'SKIP_STEPS': str(np.random.randint(2, 6)) } self.check_with_place( "dist_save_load.py", diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 3d1ce6b27c935ddca0f2f5fb377e69b571e3714c..3566fed215229223f4d2ecd1bbb66cb297dd7716 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -741,21 +741,40 @@ class TestLoadSliceVar(TranspilerTest): pserver, _ = self.get_pserver(self.pserver1_ep) pserver2, _ = self.get_pserver(self.pserver2_ep) - self.assertTrue(pserver._slice_vars_and_attrs) - self.assertTrue(pserver2._slice_vars_and_attrs) - - for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)): - self.assertEqual(pserver._slice_vars_and_attrs[idx][0], - pserver2._slice_vars_and_attrs[idx][0]) - - total_numel = six.moves.reduce( - lambda x, y: x * y, pserver._slice_vars_and_attrs[idx][0].shape) - self.assertEqual( - total_numel, - six.moves.reduce(lambda x, y: x * y, - pserver._slice_vars_and_attrs[idx][2].shape) + - six.moves.reduce(lambda x, y: x * y, - pserver2._slice_vars_and_attrs[idx][2].shape)) + vars_ps1 = pserver._parameters_on_pservers.get_distributed_vars_by_ep( + self.pserver1_ep) + vars_ps2 = pserver._parameters_on_pservers.get_distributed_vars_by_ep( + self.pserver2_ep) + + self.assertTrue(vars_ps1) + self.assertTrue(vars_ps2) + + for idx in six.moves.xrange(len(vars_ps1)): + total_numel = 0 + ps1_numel, ps2_numel = 0, 0 + + ps1_var = vars_ps1[idx] + + if not ps1_var.is_slice: + total_numel = six.moves.reduce(lambda x, y: x * y, + vars_ps1[idx].origin.shape) + ps1_numel = six.moves.reduce(lambda x, y: x * y, + vars_ps1[idx].slice.shape) + else: + ps2_var = None + for var in vars_ps2: + if var.origin.name == ps1_var.origin.name: + ps2_var = var + break + + total_numel = six.moves.reduce(lambda x, y: x * y, + ps1_var.origin.shape) + ps1_numel = six.moves.reduce(lambda x, y: x * y, + ps1_var.slice.shape) + ps2_numel = six.moves.reduce(lambda x, y: x * y, + ps2_var.slice.shape) + + self.assertEqual(total_numel, ps1_numel + ps2_numel) class TestNCCL2Transpile(TranspilerTest): diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py index 7ec1f0ae753724dac5c4675926ead87a097a7a99..56dfb095def62bc617948821038f0c15c1547683 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -16,12 +16,17 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +# FIXME(zjl): It seems that this unittest fails randomly +# when comparing all reduce last loss and reduce last loss +# e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta +# Disable it temporarily. 
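
For reference, the DecodeCenterSize math exercised by the rewritten box_decoder helper earlier in this patch reduces to a few NumPy lines. The snippet below is only an illustrative sketch on made-up toy boxes with a one-rank variance; it is not part of any test file.

import numpy as np

p_box = np.array([[0., 0., 10., 10.]], dtype=np.float32)      # one prior box (x1, y1, x2, y2)
t_box = np.array([[[0.1, 0.1, 0.2, 0.2]]], dtype=np.float32)  # one encoded target box
var = np.array([0.1, 0.1, 0.2, 0.2], dtype=np.float32)        # one-rank PriorBoxVar

norm = True                                   # box_normalized=True, so no +1 on width/height
pb_w = p_box[:, 2] - p_box[:, 0] + (not norm)
pb_h = p_box[:, 3] - p_box[:, 1] + (not norm)
pb_x = pb_w * 0.5 + p_box[:, 0]               # prior-box center x
pb_y = pb_h * 0.5 + p_box[:, 1]               # prior-box center y

tb_x = var[0] * t_box[:, :, 0] * pb_w + pb_x            # decoded center x
tb_y = var[1] * t_box[:, :, 1] * pb_h + pb_y            # decoded center y
tb_w = np.exp(var[2] * t_box[:, :, 2]) * pb_w           # decoded width
tb_h = np.exp(var[3] * t_box[:, :, 3]) * pb_h           # decoded height

decoded = np.stack([tb_x - tb_w / 2, tb_y - tb_h / 2,
                    tb_x + tb_w / 2, tb_y + tb_h / 2], axis=-1)
print(decoded)   # corner coordinates of the decoded box
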
+''' from test_parallel_executor_mnist import TestMNIST class EagerDeletionTestMNIST(TestMNIST): pass - +''' if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py new file mode 100644 index 0000000000000000000000000000000000000000..1d7ce33ea7ca2c53dc2bb2a7048444c818d4f33f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py @@ -0,0 +1,421 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +import six +import paddle.fluid as fluid +from op_test import OpTest +''' +# Equivalent code +rles = mask_util.frPyObjects([segm], im_h, im_w) +mask = mask_util.decode(rles) +''' + + +def decode(cnts, m): + v = 0 + mask = [] + for j in range(m): + for k in range(cnts[j]): + mask.append(v) + v = 1 - v + return mask + + +def poly2mask(xy, k, h, w): + scale = 5. + x = [int(scale * p + 0.5) for p in xy[::2]] + x = x + [x[0]] + y = [int(scale * p + 0.5) for p in xy[1::2]] + y = y + [y[0]] + m = sum([ + int(max(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1]))) + int(1) + for j in range(k) + ]) + + u, v = [], [] + for j in range(k): + xs = x[j] + xe = x[j + 1] + ys = y[j] + ye = y[j + 1] + dx = abs(xe - xs) + dy = abs(ys - ye) + flip = (dx >= dy and xs > xe) or (dx < dy and ys > ye) + if flip: + xs, xe = xe, xs + ys, ye = ye, ys + + if dx >= dy: + if (dx == 0): assert ye - ys == 0 + s = 0 if dx == 0 else float(ye - ys) / dx + else: + if (dy == 0): assert xe - xs == 0 + s = 0 if dy == 0 else float(xe - xs) / dy + + if dx >= dy: + ts = [dx - d if flip else d for d in range(dx + 1)] + u.extend([xs + t for t in ts]) + v.extend([int(ys + s * t + .5) for t in ts]) + else: + ts = [dy - d if flip else d for d in range(dy + 1)] + v.extend([t + ys for t in ts]) + u.extend([int(xs + s * t + .5) for t in ts]) + + k = len(u) + x = np.zeros((k), np.int) + y = np.zeros((k), np.int) + m = 0 + for j in six.moves.xrange(1, k): + if u[j] != u[j - 1]: + xd = float(u[j] if (u[j] < u[j - 1]) else (u[j] - 1)) + xd = (xd + .5) / scale - .5 + if (math.floor(xd) != xd or xd < 0 or xd > (w - 1)): + continue + yd = float(v[j] if v[j] < v[j - 1] else v[j - 1]) + yd = (yd + .5) / scale - .5 + yd = math.ceil(0 if yd < 0 else (h if yd > h else yd)) + x[m] = int(xd) + y[m] = int(yd) + m += 1 + k = m + a = [int(x[i] * h + y[i]) for i in range(k)] + a.append(h * w) + a.sort() + b = [0] + a[:len(a) - 1] + a = [c - d for (c, d) in zip(a, b)] + + k += 1 + b = [0 for i in range(k)] + b[0] = a[0] + m, j = 1, 1 + while (j < k): + if a[j] > 0: + b[m] = a[j] + m += 1 + j += 1 + else: + j += 1 + if (j < k): + b[m - 1] += a[j] + j += 1 + mask = decode(b, m) + mask = np.array(mask, dtype=np.int).reshape((w, h)) + mask = mask.transpose((1, 0)) + return mask + + +def polys_to_boxes(polys): + """Convert a list of polygons into an 
array of tight bounding boxes.""" + boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32) + for i in range(len(polys)): + poly = polys[i] + x0 = min(min(p[::2]) for p in poly) + x1 = max(max(p[::2]) for p in poly) + y0 = min(min(p[1::2]) for p in poly) + y1 = max(max(p[1::2]) for p in poly) + boxes_from_polys[i, :] = [x0, y0, x1, y1] + return boxes_from_polys + + +def bbox_overlaps(boxes, query_boxes): + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) *\ + (query_boxes[k, 3] - query_boxes[k, 1] + 1) + for n in range(N): + iw = min(boxes[n, 2], query_boxes[k, 2]) -\ + max(boxes[n, 0], query_boxes[k, 0]) + 1 + if iw > 0: + ih = min(boxes[n, 3], query_boxes[k, 3]) -\ + max(boxes[n, 1], query_boxes[k, 1]) + 1 + if ih > 0: + ua = float( + (boxes[n, 2] - boxes[n, 0] + 1) *\ + (boxes[n, 3] - boxes[n, 1] + 1) +\ + box_area - iw * ih) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def polys_to_mask_wrt_box(polygons, box, M): + """Convert from the COCO polygon segmentation format to a binary mask + encoded as a 2D array of data type numpy.float32. The polygon segmentation + is understood to be enclosed in the given box and rasterized to an M x M + mask. The resulting mask is therefore of shape (M, M). + """ + w = box[2] - box[0] + h = box[3] - box[1] + + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + polygons_norm = [] + for poly in polygons: + p = np.array(poly, dtype=np.float32) + p[0::2] = (p[0::2] - box[0]) * M / w + p[1::2] = (p[1::2] - box[1]) * M / h + polygons_norm.append(p) + + mask = [] + for polygons in polygons_norm: + assert polygons.shape[0] % 2 == 0 + k = polygons.shape[0] // 2 + mask.append(poly2mask(polygons, k, M, M)) + mask = np.array(mask) + # Flatten in case polygons was a list + mask = np.sum(mask, axis=0) + mask = np.array(mask > 0, dtype=np.float32) + return mask + + +def expand_mask_targets(masks, mask_class_labels, resolution, num_classes): + """Expand masks from shape (#masks, resolution ** 2) + to (#masks, #classes * resolution ** 2) to encode class + specific mask targets. 
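
The bbox_overlaps helper above uses the classic integer-pixel (+1) IoU convention. As a sanity check, the same computation for a single pair of boxes looks like this; the coordinates are toy values chosen only for illustration.

import numpy as np

box = np.array([0., 0., 9., 9.], dtype=np.float32)        # 10x10 box under the +1 convention
query = np.array([5., 5., 14., 14.], dtype=np.float32)    # 10x10 box shifted by 5 pixels

box_area = (query[2] - query[0] + 1) * (query[3] - query[1] + 1)
iw = min(box[2], query[2]) - max(box[0], query[0]) + 1     # intersection width
ih = min(box[3], query[3]) - max(box[1], query[1]) + 1     # intersection height
union = (box[2] - box[0] + 1) * (box[3] - box[1] + 1) + box_area - iw * ih
iou = iw * ih / union
print(iou)   # 25 / (100 + 100 - 25) = 0.142857...
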
+ """ + assert masks.shape[0] == mask_class_labels.shape[0] + + # Target values of -1 are "don't care" / ignore labels + mask_targets = -np.ones( + (masks.shape[0], num_classes * resolution**2), dtype=np.int32) + for i in range(masks.shape[0]): + cls = int(mask_class_labels[i]) + start = resolution**2 * cls + end = start + resolution**2 + # Ignore background instance + # (only happens when there is no fg samples in an image) + if cls > 0: + mask_targets[i, start:end] = masks[i, :] + return mask_targets + + +def generate_mask_labels(num_classes, im_info, gt_classes, is_crowd, + label_int32, gt_polys, resolution, rois, roi_lod, + gt_lod): + mask_rois = [] + roi_has_mask_int32 = [] + mask_int32 = [] + new_lod = [] + for i in range(len(im_info)): + roi_s = roi_lod[i] + roi_e = roi_lod[i + 1] + gt_s = gt_lod[i] + gt_e = gt_lod[i + 1] + mask_blob = _sample_mask(num_classes, im_info[i], gt_classes[gt_s:gt_e], + is_crowd[gt_s:gt_e], label_int32[roi_s:roi_e], + gt_polys[i], resolution, rois[roi_s:roi_e]) + new_lod.append(mask_blob['mask_rois'].shape[0]) + mask_rois.append(mask_blob['mask_rois']) + roi_has_mask_int32.append(mask_blob['roi_has_mask_int32']) + mask_int32.append(mask_blob['mask_int32']) + return mask_rois, roi_has_mask_int32, mask_int32, new_lod + + +def _sample_mask( + num_classes, + im_info, + gt_classes, + is_crowd, + label_int32, + gt_polys, # [[[], []], []] + resolution, + rois): + mask_blob = {} + im_scale = im_info[2] + sample_boxes = rois + polys_gt_inds = np.where((gt_classes > 0) & (is_crowd == 0))[0] + polys_gt = [gt_polys[i] for i in polys_gt_inds] + boxes_from_polys = polys_to_boxes(polys_gt) + + fg_inds = np.where(label_int32 > 0)[0] + roi_has_mask = fg_inds.copy() + if fg_inds.shape[0] > 0: + mask_class_labels = label_int32[fg_inds] + masks = np.zeros((fg_inds.shape[0], resolution**2), dtype=np.int32) + rois_fg = sample_boxes[fg_inds] + overlaps_bbfg_bbpolys = bbox_overlaps( + rois_fg.astype(np.float32), boxes_from_polys.astype(np.float32)) + fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1) + for i in range(rois_fg.shape[0]): + fg_polys_ind = fg_polys_inds[i] + poly_gt = polys_gt[fg_polys_ind] + roi_fg = rois_fg[i] + mask = polys_to_mask_wrt_box(poly_gt, roi_fg, resolution) + mask = np.array(mask > 0, dtype=np.int32) + masks[i, :] = np.reshape(mask, resolution**2) + else: + bg_inds = np.where(label_int32 == 0)[0] + rois_fg = sample_boxes[bg_inds[0]].reshape((1, -1)) + masks = -np.ones((1, resolution**2), dtype=np.int32) + mask_class_labels = np.zeros((1, )) + roi_has_mask = np.append(roi_has_mask, 0) + masks = expand_mask_targets(masks, mask_class_labels, resolution, + num_classes) + rois_fg *= im_scale + mask_blob['mask_rois'] = rois_fg + mask_blob['roi_has_mask_int32'] = roi_has_mask + mask_blob['mask_int32'] = masks + return mask_blob + + +def trans_lod(lod): + new_lod = [0] + for i in range(len(lod)): + new_lod.append(lod[i] + new_lod[i]) + return new_lod + + +class TestGenerateMaskLabels(OpTest): + def set_data(self): + self.init_test_case() + self.make_generate_proposal_labels_out() + self.generate_gt_polys() + self.generate_groundtruth() + self.init_test_output() + self.inputs = { + 'ImInfo': self.im_info, + 'GtClasses': (self.gt_classes.astype(np.int32), self.gt_lod), + 'IsCrowd': (self.is_crowd.astype(np.int32), self.gt_lod), + 'LabelsInt32': (self.label_int32.astype(np.int32), self.rois_lod), + 'GtSegms': (self.gt_polys.astype(np.float32), self.masks_lod), + 'Rois': (self.rois.astype(np.float32), self.rois_lod) + } + self.attrs = { + 'num_classes': 
self.num_classes, + 'resolution': self.resolution + } + self.outputs = { + 'MaskRois': (self.mask_rois, [self.new_lod]), + 'RoiHasMaskInt32': (self.roi_has_mask_int32, [self.new_lod]), + 'MaskInt32': (self.mask_int32, [self.new_lod]) + } + + def init_test_case(self): + self.num_classes = 81 + self.resolution = 14 + self.batch_size = 2 + self.batch_size_per_im = 64 + self.images_shape = [100, 200] + np.random.seed(0) + + def make_generate_proposal_labels_out(self): + rois = [] + self.rois_lod = [[]] + self.label_int32 = [] + for bno in range(self.batch_size): + self.rois_lod[0].append(self.batch_size_per_im) + for i in range(self.batch_size_per_im): + xywh = np.random.rand(4) + xy1 = xywh[0:2] * 2 + wh = xywh[2:4] * (self.images_shape[0] - xy1) + xy2 = xy1 + wh + roi = [xy1[0], xy1[1], xy2[0], xy2[1]] + rois.append(roi) + self.rois = np.array(rois).astype("float32") + for idx, roi_num in enumerate(self.rois_lod[0]): + for roi_id in range(roi_num): + class_id = np.random.random_integers(self.num_classes - 1) + if idx == 0: + # set an image with no foreground, to test the empty case + self.label_int32.append(0) + else: + self.label_int32.append(class_id) + label_np = np.array(self.label_int32) + self.label_int32 = label_np[:, np.newaxis] + + def generate_gt_polys(self): + h, w = self.images_shape[0:2] + self.gt_polys = [] + self.gt_polys_list = [] + max_gt = 4 + max_poly_num = 5 + min_poly_size = 4 + max_poly_size = 16 + lod0 = [] + lod1 = [] + lod2 = [] + for i in range(self.batch_size): + gt_num = np.random.randint(1, high=max_gt, size=1)[0] + lod0.append(gt_num) + ptss = [] + for i in range(gt_num): + poly_num = np.random.randint(1, max_poly_num, size=1)[0] + lod1.append(poly_num) + pts = [] + for j in range(poly_num): + poly_size = np.random.randint( + min_poly_size, max_poly_size, size=1)[0] + x = np.random.rand(poly_size, 1) * w + y = np.random.rand(poly_size, 1) * h + xy = np.concatenate((x, y), axis=1) + pts.append(xy.flatten().tolist()) + self.gt_polys.extend(xy.flatten().tolist()) + lod2.append(poly_size) + ptss.append(pts) + self.gt_polys_list.append(ptss) + self.masks_lod = [lod0, lod1, lod2] + self.gt_lod = [lod0] + self.gt_polys = np.array(self.gt_polys).astype('float32').reshape(-1, 2) + + def generate_groundtruth(self): + self.im_info = [] + self.gt_classes = [] + self.is_crowd = [] + for roi_num in self.gt_lod[0]: + self.im_info.append(self.images_shape + [1.0]) + for roi_id in range(roi_num): + class_id = np.random.random_integers(self.num_classes - 1) + self.gt_classes.append(class_id) + self.is_crowd.append(0) + self.im_info = np.array(self.im_info).astype(np.float32) + gt_classes_np = np.array(self.gt_classes) + self.gt_classes = gt_classes_np[:, np.newaxis] + is_crowd_np = np.array(self.is_crowd) + self.is_crowd = is_crowd_np[:, np.newaxis] + + def init_test_output(self): + roi_lod = trans_lod(self.rois_lod[0]) + gt_lod = trans_lod(self.gt_lod[0]) + outs = generate_mask_labels(self.num_classes, self.im_info, + self.gt_classes, self.is_crowd, + self.label_int32, self.gt_polys_list, + self.resolution, self.rois, roi_lod, gt_lod) + self.mask_rois = outs[0] + self.roi_has_mask_int32 = outs[1] + self.mask_int32 = outs[2] + self.new_lod = outs[3] + + self.mask_rois = np.vstack(self.mask_rois) + self.roi_has_mask_int32 = np.hstack(self.roi_has_mask_int32)[:, + np.newaxis] + self.mask_int32 = np.vstack(self.mask_int32) + + def setUp(self): + self.op_type = "generate_mask_labels" + self.set_data() + + def test_check_output(self): + self.check_output() + + +if __name__ == 
'__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index 2d5cd3b24bff52d82353ccf3fd2ecb69166c66c6..5f6328707fd80ec8f11b96cc65e2dcaf44496d58 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 9340d558577b4b3141df9317900ee33bbb683a0e..5ce405dccae4cfd66cde471c097698b0869f29fe 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
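
These detection tests describe LoD as per-sequence lengths and convert them to row offsets via the trans_lod helper above. A minimal sketch of that conversion, using a hypothetical two-image LoD:

lod_lengths = [64, 64]          # hypothetical: 64 RoIs per image, 2 images
offsets = [0]
for length in lod_lengths:
    offsets.append(offsets[-1] + length)
print(offsets)                  # [0, 64, 128]; image i owns rows offsets[i]:offsets[i+1]
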
+from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index dfe4daca95af5e7b1aff93c6fa9027dec7c64642..baaddf9f2e5b123300f1d083b33ea644665348fd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -66,7 +66,141 @@ class MLP(fluid.imperative.Layer): return x +class SimpleRNNCell(fluid.imperative.Layer): + def __init__(self, step_input_size, hidden_size, output_size, param_attr): + super(SimpleRNNCell, self).__init__() + self.step_input_size = step_input_size + self.hidden_size = hidden_size + self.output_size = output_size + self._dype = core.VarDesc.VarType.FP32 + from paddle.fluid.layer_helper import LayerHelper + self._helper = LayerHelper( + 'SimpleRNNCell', act="tanh", param_attr=param_attr) + + def _build_once(self, inputs, pre_hidden): + i2h_param_shape = [self.step_input_size, self.hidden_size] + h2h_param_shape = [self.hidden_size, self.hidden_size] + h2o_param_shape = [self.output_size, self.hidden_size] + self._i2h_w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=i2h_param_shape, + dtype=self._dtype, + is_bias=False) + self._h2h_w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=h2h_param_shape, + dtype=self._dtype, + is_bias=False) + self._h2o_w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=h2o_param_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, input, pre_hidden): + + tmp_i2h = self._helper.create_variable_for_type_inference(self._dtype) + tmp_h2h = self._helper.create_variable_for_type_inference(self._dtype) + hidden = self._helper.create_variable_for_type_inference(self._dype) + out = self._helper.create_variable_for_type_inference(self._dype) + softmax_out = self._helper.create_variable_for_type_inference( + self._dtype) + reduce_out = self._helper.create_variable_for_type_inference( + self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": input, + "Y": self._i2h_w}, + outputs={"Out": tmp_i2h}, + attrs={"x_num_col_dims": 1, + "y_num_col_dims": 1}) + + self._helper.append_op( + type="mul", + inputs={"X": pre_hidden, + "Y": self._h2h_w}, + outputs={"Out": tmp_h2h}, + attrs={"x_num_col_dims": 1, + "y_num_col_dims": 1}) + + self._helper.append_op( + type="elementwise_add", + inputs={'X': tmp_h2h, + 'Y': tmp_i2h}, + outputs={'Out': hidden}, + attrs={'axis': -1, + 'use_mkldnn': False}) + hidden = self._helper.append_activation(hidden) + + self._helper.append_op( + type="mul", + inputs={"X": hidden, + "Y": self._h2o_w}, + outputs={"Out": out}, + attrs={"x_num_col_dims": 1, + "y_num_col_dims": 1}) + + self._helper.append_op( + type="softmax", + inputs={"X": out}, + outputs={"Out": softmax_out}, + attrs={"use_cudnn": False}) + + self._helper.append_op( + type='reduce_sum', + inputs={'X': softmax_out}, + outputs={'Out': reduce_out}, + attrs={'dim': None, + 'keep_dim': False, + 'reduce_all': True}) + + return reduce_out, hidden + + +class SimpleRNN(fluid.imperative.Layer): + def __init__(self): + super(SimpleRNN, self).__init__() + self.seq_len = 4 + self._cell = SimpleRNNCell( + 3, + 3, + 3, + fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.1))) + + def forward(self, inputs): + outs = list() + pre_hiddens = list() + + init_hidden = fluid.layers.tensor.create_parameter( + attr=fluid.ParamAttr( + 
initializer=fluid.initializer.Constant(value=0.1)), + shape=[1, 3], + dtype='float32', + is_bias=False) + pre_hidden = init_hidden + for i in range(self.seq_len): + input = fluid.layers.slice( + inputs, axes=[1], starts=[i], ends=[i + 1]) + input = fluid.layers.reshape(input, shape=[1, 3]) + out_softmax, pre_hidden = self._cell(input, pre_hidden) + outs.append(out_softmax) + + return outs, pre_hiddens + + class TestImperative(unittest.TestCase): + def test_sum_op(self): + x = np.ones([2, 2], np.float32) + with fluid.imperative.guard(): + inputs = [] + for _ in range(10): + inputs.append(fluid.imperative.base.to_variable(x)) + ret = fluid.layers.sums(inputs) + loss = fluid.layers.reduce_sum(ret) + loss._backward() + self.assertTrue(np.allclose(ret._numpy(), x * 10)) + self.assertTrue(np.allclose(inputs[0]._gradient(), x)) + def test_layer(self): with fluid.imperative.guard(): cl = core.Layer() @@ -133,7 +267,8 @@ class TestImperative(unittest.TestCase): x = fluid.layers.reduce_sum(fluid.layers.tanh(x1)) param_grads = fluid.backward.append_backward( x, parameter_list=[x1.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -160,7 +295,8 @@ class TestImperative(unittest.TestCase): x = l(inp)[0] param_grads = fluid.backward.append_backward( x, parameter_list=[l._x_for_debug.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -186,7 +322,8 @@ class TestImperative(unittest.TestCase): out = mlp(inp) param_grads = fluid.backward.append_backward( out, parameter_list=[mlp._fc1._w.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) exe.run(fluid.default_startup_program()) static_out, static_grad = exe.run( @@ -196,6 +333,41 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_out, static_out)) self.assertTrue(np.allclose(dy_grad, static_grad)) + def test_rnn(self): + np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], + [10.0, 11.0, 12.0]]) + np_inp = np_inp.reshape((1, 4, 3)) + np_inp = np_inp.astype(np.float32) + with fluid.imperative.guard(): + var_inp = fluid.imperative.base.to_variable(np_inp) + var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) + simple_rnn = SimpleRNN() + outs, pre_hiddens = simple_rnn.forward(var_inp) + dy_out = outs[3]._numpy() + outs[3]._backward() + dy_grad_h2o = simple_rnn._cell._h2o_w._gradient() + dy_grad_h2h = simple_rnn._cell._h2h_w._gradient() + dy_grad_i2h = simple_rnn._cell._i2h_w._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[1, 4, 3], append_batch_size=False) + simple_rnn = SimpleRNN() + outs, pre_hiddens = simple_rnn(inp) + param_grads = fluid.backward.append_backward(outs[3]) + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + static_out, static_grad_h2o, static_grad_h2h, static_grad_i2h = exe.run( + feed={inp.name: np_inp}, + fetch_list=[ + outs[3].name, param_grads[0][1].name, + param_grads[1][1].name, param_grads[2][1].name + ]) + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad_h2o, static_grad_h2o)) + self.assertTrue(np.allclose(dy_grad_h2h, static_grad_h2h)) + 
self.assertTrue(np.allclose(dy_grad_i2h, static_grad_i2h)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 4fe286f85ec551946a9431f70d7012b4e7d79662..681661bfc63db95653be371688a047efe96f3866 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -20,6 +20,7 @@ import sys import paddle import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope @@ -58,7 +59,7 @@ class Generator(fluid.imperative.Layer): class TestImperativeMnist(unittest.TestCase): - def test_mnist_cpu_float32(self): + def test_gan_float32(self): seed = 90 startup = fluid.Program() @@ -115,7 +116,8 @@ class TestImperativeMnist(unittest.TestCase): sgd = SGDOptimizer(learning_rate=1e-3) sgd.minimize(g_loss) - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0)) static_params = dict() with fluid.scope_guard(scope): img = np.ones([2, 1], np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 63eeae4b712c2064309b664b91d5f0347b67817d..08b155acc657c3a4a73f5b1d72ac356fc7e83a58 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -82,13 +82,14 @@ class MNIST(fluid.imperative.Layer): self._simple_img_conv_pool_2 = SimpleImgConvPool( 20, 50, 5, 2, 2, act="relu") - pool_2_shape = 50 * 8 * 8 + pool_2_shape = 50 * 4 * 4 SIZE = 10 scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 self._fc = FC(10, param_attr=fluid.param_attr.ParamAttr( initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale))) + loc=0.0, scale=scale)), + act="softmax") def forward(self, inputs): x = self._simple_img_conv_pool_1(inputs) @@ -98,9 +99,9 @@ class MNIST(fluid.imperative.Layer): class TestImperativeMnist(unittest.TestCase): - def test_mnist_cpu_float32(self): + def test_mnist_float32(self): seed = 90 - + batch_num = 2 with fluid.imperative.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed @@ -112,15 +113,15 @@ class TestImperativeMnist(unittest.TestCase): dy_param_init_value = {} for batch_id, data in enumerate(train_reader()): - if batch_id >= 2: + if batch_id >= batch_num: break - x_data = np.array( + dy_x_data = np.array( [x[0].reshape(1, 28, 28) for x in data]).astype('float32') y_data = np.array([x[1] for x in data]).astype('int64').reshape( 128, 1) - img = to_variable(x_data) + img = to_variable(dy_x_data) label = to_variable(y_data) label._stop_gradient = True @@ -136,6 +137,7 @@ class TestImperativeMnist(unittest.TestCase): avg_loss._backward() sgd.minimize(avg_loss) + mnist.clear_gradients() dy_param_value = {} for param in fluid.default_main_program().global_block( ).all_parameters(): @@ -145,7 +147,8 @@ class TestImperativeMnist(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) mnist = MNIST() sgd = 
SGDOptimizer(learning_rate=1e-3) @@ -174,10 +177,10 @@ class TestImperativeMnist(unittest.TestCase): static_param_init_value[static_param_name_list[i]] = out[i] for batch_id, data in enumerate(train_reader()): - if batch_id >= 2: + if batch_id >= batch_num: break - x_data = np.array( + static_x_data = np.array( [x[0].reshape(1, 28, 28) for x in data]).astype('float32') y_data = np.array([x[1] for x in data]).astype('int64').reshape( [128, 1]) @@ -185,7 +188,7 @@ class TestImperativeMnist(unittest.TestCase): fetch_list = [avg_loss.name] fetch_list.extend(static_param_name_list) out = exe.run(fluid.default_main_program(), - feed={"pixel": x_data, + feed={"pixel": static_x_data, "label": y_data}, fetch_list=fetch_list) @@ -195,11 +198,12 @@ class TestImperativeMnist(unittest.TestCase): static_param_value[static_param_name_list[i - 1]] = out[i] for key, value in six.iteritems(static_param_init_value): - self.assertTrue( - np.allclose(value.all(), dy_param_init_value[key].all())) - self.assertTrue(np.allclose(static_out.all(), dy_out.all())) + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + + self.assertTrue(np.allclose(static_out, dy_out)) + for key, value in six.iteritems(static_param_value): - self.assertTrue(np.allclose(value.all(), dy_param_value[key].all())) + self.assertTrue(np.allclose(value, dy_param_value[key])) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..afe990e74ff96dfbca4f335b561f9bbe7d295246 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -0,0 +1,350 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
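
The assertion changes in test_imperative_optimizer.py above drop the .all() calls because np.allclose(a.all(), b.all()) only compares two booleans rather than the arrays themselves. A small demonstration:

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])

print(np.allclose(a.all(), b.all()))  # True  -- both .all() results are truthy; arrays never compared
print(np.allclose(a, b))              # False -- element-wise comparison, as intended
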
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid +from paddle.fluid.imperative.nn import Embedding +import paddle.fluid.framework as framework +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope +import numpy as np +import six +from paddle.fluid.backward import append_backward + + +class SimpleLSTMRNN(fluid.imperative.Layer): + def __init__(self, + hidden_size, + num_steps, + num_layers=2, + init_scale=0.1, + dropout=None): + super(SimpleLSTMRNN, self).__init__() + self._hidden_size = hidden_size + self._num_layers = num_layers + self._init_scale = init_scale + self._dropout = dropout + self._input = None + self._num_steps = num_steps + + def _build_once(self, input_embedding, init_hidden=None, init_cell=None): + self.weight_1_arr = [] + self.weight_2_arr = [] + self.bias_arr = [] + self.hidden_array = [] + self.cell_array = [] + self.mask_array = [] + + for i in range(self._num_layers): + weight_1 = fluid.layers.create_parameter( + shape=[self._hidden_size * 2, self._hidden_size * 4], + dtype="float32", + name="fc_weight1_" + str(i), + default_initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)) + self.weight_1_arr.append(weight_1) + bias_1 = fluid.layers.create_parameter( + [self._hidden_size * 4], + dtype="float32", + name="fc_bias1_" + str(i), + default_initializer=fluid.initializer.Constant(0.0)) + self.bias_arr.append(bias_1) + + pre_hidden = fluid.layers.slice( + init_hidden, axes=[0], starts=[i], ends=[i + 1]) + pre_cell = fluid.layers.slice( + init_cell, axes=[0], starts=[i], ends=[i + 1]) + pre_hidden = fluid.layers.reshape( + pre_hidden, shape=[-1, self._hidden_size]) + pre_cell = fluid.layers.reshape( + pre_cell, shape=[-1, self._hidden_size]) + self.hidden_array.append(pre_hidden) + self.cell_array.append(pre_cell) + + def parameters(self): + parameters = list() + for param in self.weight_1_arr: + parameters.append(param) + for param in self.weight_2_arr: + parameters.append(param) + for bias in self.bias_arr: + parameters.append(bias) + return parameters + + def forward(self, input_embedding, init_hidden=None, init_cell=None): + res = [] + for index in range(self._num_steps): + self._input = fluid.layers.slice( + input_embedding, axes=[1], starts=[index], ends=[index + 1]) + self._input = fluid.layers.reshape( + self._input, shape=[-1, self._hidden_size]) + for k in range(self._num_layers): + pre_hidden = self.hidden_array[k] + pre_cell = self.cell_array[k] + weight_1 = self.weight_1_arr[k] + bias = self.bias_arr[k] + + nn = fluid.layers.concat([self._input, pre_hidden], 1) + gate_input = fluid.layers.matmul(x=nn, y=weight_1) + + gate_input = fluid.layers.elementwise_add(gate_input, bias) + i, j, f, o = fluid.layers.split( + gate_input, num_or_sections=4, dim=-1) + c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( + i) * fluid.layers.tanh(j) + m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + self.hidden_array[k] = m + self.cell_array[k] = c + self._input = m + + if self._dropout is not None and self._dropout > 0.0: + self._input = fluid.layers.dropout( + self._input, + dropout_prob=self._dropout, + dropout_implementation='upscale_in_train') + res.append( + fluid.layers.reshape( + self._input, shape=[1, -1, self._hidden_size])) + real_res = fluid.layers.concat(res, 0) + real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) + last_hidden = 
fluid.layers.concat(self.hidden_array, 1) + last_hidden = fluid.layers.reshape( + last_hidden, shape=[-1, self._num_layers, self._hidden_size]) + last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_cell = fluid.layers.concat(self.cell_array, 1) + last_cell = fluid.layers.reshape( + last_cell, shape=[-1, self._num_layers, self._hidden_size]) + last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + return real_res, last_hidden, last_cell + + +class PtbModel(fluid.imperative.Layer): + def __init__(self, + hidden_size, + vocab_size, + num_layers=2, + num_steps=20, + init_scale=0.1, + dropout=None): + super(PtbModel, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.init_scale = init_scale + self.num_layers = num_layers + self.num_steps = num_steps + self.dropout = dropout + self.simple_lstm_rnn = SimpleLSTMRNN( + hidden_size, + num_steps, + num_layers=num_layers, + init_scale=init_scale, + dropout=dropout) + self.embedding = Embedding( + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False, + param_attr=fluid.ParamAttr( + name='embedding_para', + initializer=fluid.initializer.UniformInitializer( + low=-init_scale, high=init_scale))) + self.softmax_weight = fluid.layers.create_parameter( + [self.hidden_size, self.vocab_size], + dtype="float32", + name="softmax_weight", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + self.softmax_bias = fluid.layers.create_parameter( + [self.vocab_size], + dtype="float32", + name='softmax_bias', + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + + def _build_once(self, input, label, init_hidden, init_cell): + pass + + def parameters(self): + parameters = self.simple_lstm_rnn.parameters() + [ + self.softmax_weight, self.softmax_bias + ] + self.embedding.parameters() + return parameters + + def forward(self, input, label, init_hidden, init_cell): + + init_h = fluid.layers.reshape( + init_hidden, shape=[self.num_layers, -1, self.hidden_size]) + + init_c = fluid.layers.reshape( + init_cell, shape=[self.num_layers, -1, self.hidden_size]) + + x_emb = self.embedding(input) + x_emb = fluid.layers.reshape( + x_emb, shape=[-1, self.num_steps, self.hidden_size]) + if self.dropout is not None and self.dropout > 0.0: + x_emb = fluid.layers.dropout( + x_emb, + dropout_prob=self.drop_out, + dropout_implementation='upscale_in_train') + rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h, + init_c) + rnn_out = fluid.layers.reshape( + rnn_out, shape=[-1, self.num_steps, self.hidden_size]) + projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = fluid.layers.elementwise_add(projection, self.softmax_bias) + projection = fluid.layers.reshape( + projection, shape=[-1, self.vocab_size]) + projection = fluid.layers.reshape( + projection, shape=[-1, self.vocab_size]) + loss = fluid.layers.softmax_with_cross_entropy( + logits=projection, label=label, soft_label=False) + loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = fluid.layers.reduce_mean(loss, dim=[0]) + loss = fluid.layers.reduce_sum(loss) + loss.permissions = True + + return loss, last_hidden, last_cell + + +class TestImperativePtbRnn(unittest.TestCase): + def test_ptb_rnn_cpu_float32(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + + with fluid.imperative.guard(): + 
fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + sgd = SGDOptimizer(learning_rate=1e-3) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + for i in range(2): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + x_data = x_data.reshape((-1, num_steps, 1)) + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param._numpy() + dy_loss._backward() + sgd.minimize(dy_loss) + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + exe = fluid.Executor(fluid.CPUPlace()) + sgd = SGDOptimizer(learning_rate=1e-3) + x = fluid.layers.data(name="x", shape=[-1, 3, 1], dtype='int64') + y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32') + init_hidden = fluid.layers.data( + name="init_hidden", shape=[1], dtype='float32') + init_cell = fluid.layers.data( + name="init_cell", shape=[1], dtype='float32') + + static_loss, static_last_hidden, static_last_cell = ptb_model( + x, y, init_hidden, init_cell) + sgd.minimize(static_loss) + static_param_updated = dict() + static_param_init = dict() + static_param_name_list = list() + for param in ptb_model.parameters(): + static_param_name_list.append(param.name) + + out = exe.run(framework.default_startup_program(), + fetch_list=static_param_name_list) + for i in range(len(static_param_name_list)): + static_param_init[static_param_name_list[i]] = out[i] + static_loss_value = None + static_last_cell_value = None + static_last_hidden_value = None + for i in range(2): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + x_data = x_data.reshape((-1, num_steps, 1)) + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + fetch_list = [static_loss, static_last_hidden, static_last_cell] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "y": y_data, + "init_hidden": init_hidden_data, + "init_cell": init_cell_data + }, + fetch_list=fetch_list) + static_loss_value = out[0] + static_last_cell_value = out[1] + static_last_hidden_value = out[2] + for k in range(3, len(out)): + static_param_updated[static_param_name_list[k - 3]] = out[k] + + self.assertTrue( + np.allclose(static_loss_value.all(), dy_loss._numpy().all())) + self.assertTrue( 
+ np.allclose(static_last_cell_value.all(), + last_cell._numpy().all())) + self.assertTrue( + np.allclose(static_last_hidden_value.all(), + last_hidden._numpy().all())) + for key, value in six.iteritems(static_param_init): + self.assertTrue( + np.allclose(value.all(), dy_param_init[key].all())) + for key, value in six.iteritems(static_param_updated): + self.assertTrue( + np.allclose(value.all(), dy_param_updated[key].all())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c27fd0b8024a8fa3310a62de34299fb621e2902f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -0,0 +1,371 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope + +batch_size = 8 +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": batch_size, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + }, + "batch_size": batch_size, + "lr": 0.1, + "total_images": 1281164, +} + + +def optimizer_setting(params): + ls = params["learning_strategy"] + if ls["name"] == "piecewise_decay": + if "total_images" not in params: + total_images = 1281167 + else: + total_images = params["total_images"] + batch_size = ls["batch_size"] + step = int(total_images / batch_size + 1) + + bd = [step * e for e in ls["epochs"]] + base_lr = params["lr"] + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + # TODO(minqiyang): Add learning rate scheduler support to imperative mode + # optimizer = fluid.optimizer.Momentum( + # learning_rate=params["lr"], + # learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + # momentum=0.9, + # regularization=fluid.regularizer.L2Decay(1e-4)) + + return optimizer + + +class ConvBNLayer(fluid.imperative.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=None) + + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = 
self._batch_norm(y) + + return y + + +class BottleneckBlock(fluid.imperative.Layer): + def __init__(self, num_channels, num_filters, stride, shortcut=True): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=conv2) + + layer_helper = LayerHelper('elementwise_add_activation', act='relu') + return layer_helper.append_activation(y) + + +class ResNet(fluid.imperative.Layer): + def __init__(self, layers=50, class_dim=102): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + self.conv = ConvBNLayer( + num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') + self.pool2d_max = Pool2D( + pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + + self.bottleneck_block_list = [] + num_channels = 64 + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = BottleneckBlock( + num_channels=num_channels, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut) + num_channels = bottleneck_block._num_channels_out + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + + import math + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.out = FC(size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = self.out(y) + return y + + +class TestImperativeResnet(unittest.TestCase): + def test_resnet_float32(self): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 1 + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + resnet = ResNet() + optimizer = optimizer_setting(train_parameters) + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + dy_param_init_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_init_value[param.name] = param._numpy() + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + dy_x_data = np.array( + 
[x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label._stop_gradient = True + + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + dy_out = avg_loss._numpy() + + if batch_id == 0: + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if param.name not in dy_param_init_value: + dy_param_init_value[param.name] = param._numpy() + + avg_loss._backward() + + dy_grad_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + np_array = np.array(param._ivar._grad_ivar().value() + .get_tensor()) + dy_grad_value[param.name + core.grad_var_suffix( + )] = np_array + + optimizer.minimize(avg_loss) + resnet.clear_gradients() + + dy_param_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_value[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + + resnet = ResNet() + optimizer = optimizer_setting(train_parameters) + + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + img = fluid.layers.data( + name='pixel', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + static_grad_name_list = [] + for param in fluid.default_startup_program().global_block( + ).all_parameters(): + static_param_name_list.append(param.name) + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + static_grad_name_list.append(param.name + + core.grad_var_suffix()) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + static_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [batch_size, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + fetch_list.extend(static_grad_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_grad_value = {} + static_out = out[0] + param_start_pos = 1 + grad_start_pos = len(static_param_name_list) + param_start_pos + for i in range(param_start_pos, + len(static_param_name_list) + param_start_pos): + static_param_value[static_param_name_list[ + i - param_start_pos]] = out[i] + for i in range(grad_start_pos, + len(static_grad_name_list) + grad_start_pos): + static_grad_value[static_grad_name_list[ + i - grad_start_pos]] = out[i] + + self.assertTrue(np.allclose(static_out, dy_out)) + + 
self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + self.assertTrue(np.isfinite(value.all())) + self.assertFalse(np.isnan(value.any())) + + self.assertEqual(len(dy_grad_value), len(static_grad_value)) + for key, value in six.iteritems(static_grad_value): + self.assertTrue(np.allclose(value, dy_grad_value[key])) + self.assertTrue(np.isfinite(value.all())) + self.assertFalse(np.isnan(value.any())) + + self.assertEqual(len(dy_param_value), len(static_param_value)) + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value, dy_param_value[key])) + self.assertTrue(np.isfinite(value.all())) + self.assertFalse(np.isnan(value.any())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index ab7183f88df809e584ca50ba16221bfdfe1376a9..2d98b063d10e2bb9071c4b8dc4ac9373f63df387 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -420,5 +420,26 @@ class TestMSRAInitializer(unittest.TestCase): self.assertEqual(init_op.type, 'assign_value') +class TestNumpyArrayInitializer(unittest.TestCase): + def test_numpy_array_initializer(self): + """Test the numpy array initializer with supplied arguments + """ + import numpy + program = framework.Program() + block = program.global_block() + np_array = numpy.random.random((10000)).astype("float32") + for _ in range(2): + block.create_parameter( + dtype=np_array.dtype, + shape=np_array.shape, + lod_level=0, + name="param", + initializer=initializer.NumpyArrayInitializer(np_array)) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'assign_value') + assert (init_op.attr('fp32_values') == np_array).all() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 90f5d797a67d951e618e64cfc5a3608335714e05..e7bc1601a54c8615e0e787d74145aa4987b6cb88 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -58,7 +58,8 @@ class TestBook(unittest.TestCase): def test_simple_conv2d(self): program = Program() with program_guard(program, startup_program=Program()): - images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32') + images = layers.data( + name='pixel', shape=[3, 48, 48], dtype='float32') layers.conv2d(input=images, num_filters=3, filter_size=[4, 4]) print(str(program)) @@ -1023,6 +1024,14 @@ class TestBook(unittest.TestCase): print(str(program)) + def test_shuffle_channel(self): + program = Program() + with program_guard(program): + x = layers.data(name="X", shape=[16, 4, 4], dtype="float32") + out = layers.shuffle_channel(x, group=4) + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index 9778bd694de4b21f3ff723846c77a8ad0dceb57b..8fc391a1ff2529460b038979c0c7d0a9d905a7e0 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -19,7 +19,7 @@ import copy from op_test import OpTest -def iou(box_a, 
box_b): +def iou(box_a, box_b, norm): """Apply intersection-over-union overlap between box_a and box_b """ xmin_a = min(box_a[0], box_a[2]) @@ -32,8 +32,10 @@ def iou(box_a, box_b): xmax_b = max(box_b[0], box_b[2]) ymax_b = max(box_b[1], box_b[3]) - area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a) - area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b) + area_a = (ymax_a - ymin_a + (norm == False)) * (xmax_a - xmin_a + + (norm == False)) + area_b = (ymax_b - ymin_b + (norm == False)) * (xmax_b - xmin_b + + (norm == False)) if area_a <= 0 and area_b <= 0: return 0.0 @@ -42,17 +44,21 @@ def iou(box_a, box_b): xb = min(xmax_a, xmax_b) yb = min(ymax_a, ymax_b) - inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) - - box_a_area = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]) - box_b_area = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) + inter_area = max(xb - xa + (norm == False), + 0.0) * max(yb - ya + (norm == False), 0.0) iou_ratio = inter_area / (area_a + area_b - inter_area) return iou_ratio -def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0): +def nms(boxes, + scores, + score_threshold, + nms_threshold, + top_k=200, + normalized=True, + eta=1.0): """Apply non-maximum suppression at test time to avoid detecting too many overlapping bounding boxes for a given object. Args: @@ -87,7 +93,7 @@ def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0): for k in range(len(selected_indices)): if keep: kept_idx = selected_indices[k] - overlap = iou(boxes[idx], boxes[kept_idx]) + overlap = iou(boxes[idx], boxes[kept_idx], normalized) keep = True if overlap <= adaptive_threshold else False else: break @@ -99,16 +105,24 @@ def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0): def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, - nms_top_k, keep_top_k): - class_num = scores.shape[0] - priorbox_num = scores.shape[1] + nms_top_k, keep_top_k, normalized, shared): + if shared: + class_num = scores.shape[0] + priorbox_num = scores.shape[1] + else: + box_num = scores.shape[0] + class_num = scores.shape[1] selected_indices = {} num_det = 0 for c in range(class_num): if c == background: continue - indices = nms(boxes, scores[c], score_threshold, nms_threshold, - nms_top_k) + if shared: + indices = nms(boxes, scores[c], score_threshold, nms_threshold, + nms_top_k, normalized) + else: + indices = nms(boxes[:, c, :], scores[:, c], score_threshold, + nms_threshold, nms_top_k, normalized) selected_indices[c] = indices num_det += len(indices) @@ -116,7 +130,10 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, score_index = [] for c, indices in selected_indices.items(): for idx in indices: - score_index.append((scores[c][idx], c, idx)) + if shared: + score_index.append((scores[c][idx], c, idx)) + else: + score_index.append((scores[idx][c], c, idx)) sorted_score_index = sorted( score_index, key=lambda tup: tup[0], reverse=True) @@ -127,24 +144,75 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, selected_indices[c] = [] for s, c, idx in sorted_score_index: selected_indices[c].append(idx) + if not shared: + for labels in selected_indices: + selected_indices[labels].sort() num_det = keep_top_k return selected_indices, num_det -def batched_multiclass_nms(boxes, scores, background, score_threshold, - nms_threshold, nms_top_k, keep_top_k): +def lod_multiclass_nms(boxes, scores, background, score_threshold, + nms_threshold, nms_top_k, keep_top_k, box_lod, + normalized): + det_outs = 
[] + lod = [] + head = 0 + for n in range(len(box_lod[0])): + box = boxes[head:head + box_lod[0][n]] + score = scores[head:head + box_lod[0][n]] + head = head + box_lod[0][n] + nmsed_outs, nmsed_num = multiclass_nms( + box, + score, + background, + score_threshold, + nms_threshold, + nms_top_k, + keep_top_k, + normalized, + shared=False) + if nmsed_num == 0: + #lod.append(1) + continue + lod.append(nmsed_num) + for c, indices in nmsed_outs.items(): + for idx in indices: + xmin, ymin, xmax, ymax = box[idx, c, :] + det_outs.append([c, score[idx][c], xmin, ymin, xmax, ymax]) + if len(lod) == 0: + lod.append(1) + + return det_outs, lod + + +def batched_multiclass_nms(boxes, + scores, + background, + score_threshold, + nms_threshold, + nms_top_k, + keep_top_k, + normalized=True): batch_size = scores.shape[0] det_outs = [] lod = [] for n in range(batch_size): - nmsed_outs, nmsed_num = multiclass_nms(boxes[n], scores[n], background, - score_threshold, nms_threshold, - nms_top_k, keep_top_k) - lod.append(nmsed_num) - if nmsed_num == 0: continue + nmsed_outs, nmsed_num = multiclass_nms( + boxes[n], + scores[n], + background, + score_threshold, + nms_threshold, + nms_top_k, + keep_top_k, + normalized, + shared=True) + if nmsed_num == 0: + continue + lod.append(nmsed_num) tmp_det_out = [] for c, indices in nmsed_outs.items(): for idx in indices: @@ -154,7 +222,8 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, sorted_det_out = sorted( tmp_det_out, key=lambda tup: tup[0], reverse=False) det_outs.extend(sorted_det_out) - + if len(lod) == 0: + lod += [1] return det_outs, lod @@ -168,7 +237,6 @@ class TestMulticlassNMSOp(OpTest): M = 1200 C = 21 BOX_SIZE = 4 - background = 0 nms_threshold = 0.3 nms_top_k = 400 @@ -206,6 +274,7 @@ class TestMulticlassNMSOp(OpTest): 'keep_top_k': keep_top_k, 'score_threshold': score_threshold, 'nms_eta': 1.0, + 'normalized': True, } def test_check_output(self): @@ -219,13 +288,70 @@ class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp): self.score_threshold = 2.0 +class TestMulticlassNMSLoDInput(OpTest): + def set_argument(self): + self.score_threshold = 0.01 + + def setUp(self): + self.set_argument() + M = 1200 + C = 21 + BOX_SIZE = 4 + box_lod = [[1200]] + background = 0 + nms_threshold = 0.3 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = self.score_threshold + normalized = False + + scores = np.random.random((M, C)).astype('float32') + + def softmax(x): + shiftx = x - np.max(x).clip(-64.) 
+ exps = np.exp(shiftx) + return exps / np.sum(exps) + + scores = np.apply_along_axis(softmax, 1, scores) + + boxes = np.random.random((M, C, BOX_SIZE)).astype('float32') + boxes[:, :, 0] = boxes[:, :, 0] * 10 + boxes[:, :, 1] = boxes[:, :, 1] * 10 + boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10 + boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10 + + nmsed_outs, lod = lod_multiclass_nms( + boxes, scores, background, score_threshold, nms_threshold, + nms_top_k, keep_top_k, box_lod, normalized) + nmsed_outs = [-1] if not nmsed_outs else nmsed_outs + nmsed_outs = np.array(nmsed_outs).astype('float32') + self.op_type = 'multiclass_nms' + self.inputs = { + 'BBoxes': (boxes, box_lod), + 'Scores': (scores, box_lod), + } + self.outputs = {'Out': (nmsed_outs, [lod])} + self.attrs = { + 'background_label': 0, + 'nms_threshold': nms_threshold, + 'nms_top_k': nms_top_k, + 'keep_top_k': keep_top_k, + 'score_threshold': score_threshold, + 'nms_eta': 1.0, + 'normalized': normalized, + } + + def test_check_output(self): + self.check_output() + + class TestIOU(unittest.TestCase): def test_iou(self): box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32') box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32') expt_output = np.array([2.0 / 16.0]).astype('float32') - calc_output = np.array([iou(box1, box2)]).astype('float32') + calc_output = np.array([iou(box1, box2, True)]).astype('float32') self.assertTrue(np.allclose(calc_output, expt_output)) diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py index 242709425f2d3f190d3c1ed795d30938fb8e23fe..5bb2260ef7a143670dd75fc88769603d1437173d 100644 --- a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py @@ -24,7 +24,8 @@ def nearest_neighbor_interp_np(X, out_h, out_w, out_size=None, - actual_shape=None): + actual_shape=None, + align_corners=True): """nearest neighbor interpolation implement in shape [N, C, H, W]""" if out_size is not None: out_h = out_size[0] @@ -35,17 +36,31 @@ def nearest_neighbor_interp_np(X, n, c, in_h, in_w = X.shape ratio_h = ratio_w = 0.0 - if out_h > 1: - ratio_h = (in_h - 1.0) / (out_h - 1.0) - if out_w > 1: - ratio_w = (in_w - 1.0) / (out_w - 1.0) + if (out_h > 1): + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h + if (out_w > 1): + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((n, c, out_h, out_w)) - for i in range(out_h): - in_i = int(ratio_h * i + 0.5) - for j in range(out_w): - in_j = int(ratio_w * j + 0.5) - out[:, :, i, j] = X[:, :, in_i, in_j] + + if align_corners: + for i in range(out_h): + in_i = int(ratio_h * i + 0.5) + for j in range(out_w): + in_j = int(ratio_w * j + 0.5) + out[:, :, i, j] = X[:, :, in_i, in_j] + else: + for i in range(out_h): + in_i = int(ratio_h * i) + for j in range(out_w): + in_j = int(ratio_w * j) + out[:, :, i, j] = X[:, :, in_i, in_j] return out.astype(X.dtype) @@ -59,7 +74,8 @@ class TestNearestInterpOp(OpTest): input_np = np.random.random(self.input_shape).astype("float32") output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w, - self.out_size, self.actual_shape) + self.out_size, self.actual_shape, + self.align_corners) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -68,7 +84,8 @@ class TestNearestInterpOp(OpTest): self.attrs = { 'out_h': self.out_h, 
'out_w': self.out_w, - 'interp_method': self.interp_method + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, } self.outputs = {'Out': output_np} @@ -84,6 +101,7 @@ class TestNearestInterpOp(OpTest): self.out_h = 2 self.out_w = 2 self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True class TestNearestNeighborInterpCase1(TestNearestInterpOp): @@ -92,6 +110,7 @@ class TestNearestNeighborInterpCase1(TestNearestInterpOp): self.input_shape = [4, 1, 7, 8] self.out_h = 1 self.out_w = 1 + self.align_corners = True class TestNearestNeighborInterpCase2(TestNearestInterpOp): @@ -100,6 +119,7 @@ class TestNearestNeighborInterpCase2(TestNearestInterpOp): self.input_shape = [3, 3, 9, 6] self.out_h = 12 self.out_w = 12 + self.align_corners = True class TestNearestNeighborInterpCase3(TestNearestInterpOp): @@ -108,6 +128,7 @@ class TestNearestNeighborInterpCase3(TestNearestInterpOp): self.input_shape = [1, 1, 128, 64] self.out_h = 64 self.out_w = 128 + self.align_corners = True class TestNearestNeighborInterpCase4(TestNearestInterpOp): @@ -117,6 +138,7 @@ class TestNearestNeighborInterpCase4(TestNearestInterpOp): self.out_h = 1 self.out_w = 1 self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = True class TestNearestNeighborInterpCase5(TestNearestInterpOp): @@ -126,6 +148,7 @@ class TestNearestNeighborInterpCase5(TestNearestInterpOp): self.out_h = 12 self.out_w = 12 self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = True class TestNearestNeighborInterpCase6(TestNearestInterpOp): @@ -135,6 +158,7 @@ class TestNearestNeighborInterpCase6(TestNearestInterpOp): self.out_h = 64 self.out_w = 128 self.out_size = np.array([65, 129]).astype("int32") + self.align_corners = True class TestNearestNeighborInterpActualShape(TestNearestInterpOp): @@ -144,6 +168,7 @@ class TestNearestNeighborInterpActualShape(TestNearestInterpOp): self.out_h = 64 self.out_w = 32 self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True class TestNearestInterpOpUint8(OpTest): @@ -155,14 +180,16 @@ class TestNearestInterpOpUint8(OpTest): input_np = np.random.randint( low=0, high=256, size=self.input_shape).astype("uint8") output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w, - self.out_size, self.actual_shape) + self.out_size, self.actual_shape, + self.align_corners) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, - 'interp_method': self.interp_method + 'interp_method': self.interp_method, + 'align_corners': self.align_corners } self.outputs = {'Out': output_np} @@ -174,6 +201,7 @@ class TestNearestInterpOpUint8(OpTest): self.input_shape = [1, 3, 9, 6] self.out_h = 10 self.out_w = 9 + self.align_corners = True class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8): @@ -182,6 +210,7 @@ class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8): self.input_shape = [2, 3, 128, 64] self.out_h = 120 self.out_w = 50 + self.align_corners = True class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8): @@ -191,6 +220,12 @@ class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8): self.out_h = 5 self.out_w = 13 self.out_size = np.array([6, 15]).astype("int32") + self.align_corners = True + + +class TestNearestInterpWithoutCorners(TestNearestInterpOp): + def set_align_corners(self): + self.align_corners = False if __name__ == "__main__": diff --git 
a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index ffd4026dbade2f8f7eace399c52ae0428f3e8d7b..d33a57f675aa98cf13e1ac0014109d9cb3856e87 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -81,11 +81,10 @@ class TestSequenceExpand(OpTest): class TestSequenceExpandCase1(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') - x_lod = [[2, 3]] y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') y_lod = [[2, 3], [2, 2, 3, 3, 3]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} - self.attrs = {'ref_level': 0} + self.attrs = {'ref_level': 1} class TestSequenceExpandCase2(TestSequenceExpand): diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py new file mode 100644 index 0000000000000000000000000000000000000000..aeaae9058187be1c9191bcbec21237c69fefe6e6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest +import paddle.fluid.core as core + + +class TestShuffleChannelOp(OpTest): + def setUp(self): + self.op_type = "shuffle_channel" + self.batch_size = 10 + self.input_channels = 16 + self.layer_h = 4 + self.layer_w = 4 + self.group = 4 + self.x = np.random.random( + (self.batch_size, self.input_channels, self.layer_h, + self.layer_w)).astype('float32') + self.inputs = {'X': self.x} + self.attrs = {'group': self.group} + n, c, h, w = self.x.shape + input_reshaped = np.reshape(self.x, + (-1, self.group, c // self.group, h, w)) + input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4)) + self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index 41797a241cab9f2b3bc4b492a1c4b6db89ac2948..ae1883f1f7e44e06e378ff6d16dbc3c5060027e4 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -18,6 +18,7 @@ import numpy as np from op_test import OpTest from scipy.special import logit from scipy.special import expit +import paddle.fluid.core as core import unittest @@ -117,5 +118,36 @@ class TestSigmoidCrossEntropyWithLogitsOp3(OpTest): self.check_grad(['X'], 'Out') +class TestSigmoidCrossEntropyWithNorm(OpTest): + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + batch_size = 64 + num_classes = 20 + ignore_index = -1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype("float32")), + 'Label': np.random.randint(-1, 2, (batch_size, num_classes)) + .astype("float32") + } + self.attrs = {'ignore_index': ignore_index, 'normalize': True} + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + out = -term1 - term2 + out[np.where(self.inputs['Label'] == ignore_index)] = 0 + if self.attrs['normalize']: + out = out / float( + np.where(self.inputs['Label'] != ignore_index)[0].size) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 544fe4b4f81909b69a05d9751316e3d3137fdc45..020c1139230a9177c4d7765367359d91839d7d46 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -16,174 +16,179 @@ from __future__ import division import unittest import numpy as np +from scipy.special import logit +from scipy.special import expit from op_test import OpTest from paddle.fluid import core -def sigmoid(x): - return 1.0 / (1.0 + np.exp(-1.0 * x)) +def l2loss(x, y): + return 0.5 * (y - x) * (y - x) -def mse(x, y, num): - return ((y - x)**2).sum() / num +def sce(x, label): + sigmoid_x = expit(x) + term1 = label * np.log(sigmoid_x) + term2 = (1.0 - label) * np.log(1.0 - sigmoid_x) + return -term1 - 
term2 -def bce(x, y, mask): - x = x.reshape((-1)) - y = y.reshape((-1)) - mask = mask.reshape((-1)) +def sigmoid(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) - error_sum = 0.0 - count = 0 - for i in range(x.shape[0]): - if mask[i] > 0: - error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i]) - count += 1 - return error_sum / (-1.0 * count) +def batch_xywh_box_iou(box1, box2): + b1_left = box1[:, :, 0] - box1[:, :, 2] / 2 + b1_right = box1[:, :, 0] + box1[:, :, 2] / 2 + b1_top = box1[:, :, 1] - box1[:, :, 3] / 2 + b1_bottom = box1[:, :, 1] + box1[:, :, 3] / 2 -def box_iou(box1, box2): - b1_x1 = box1[0] - box1[2] / 2 - b1_x2 = box1[0] + box1[2] / 2 - b1_y1 = box1[1] - box1[3] / 2 - b1_y2 = box1[1] + box1[3] / 2 - b2_x1 = box2[0] - box2[2] / 2 - b2_x2 = box2[0] + box2[2] / 2 - b2_y1 = box2[1] - box2[3] / 2 - b2_y2 = box2[1] + box2[3] / 2 + b2_left = box2[:, :, 0] - box2[:, :, 2] / 2 + b2_right = box2[:, :, 0] + box2[:, :, 2] / 2 + b2_top = box2[:, :, 1] - box2[:, :, 3] / 2 + b2_bottom = box2[:, :, 1] + box2[:, :, 3] / 2 - b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) - b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + left = np.maximum(b1_left[:, :, np.newaxis], b2_left[:, np.newaxis, :]) + right = np.minimum(b1_right[:, :, np.newaxis], b2_right[:, np.newaxis, :]) + top = np.maximum(b1_top[:, :, np.newaxis], b2_top[:, np.newaxis, :]) + bottom = np.minimum(b1_bottom[:, :, np.newaxis], + b2_bottom[:, np.newaxis, :]) - inter_rect_x1 = max(b1_x1, b2_x1) - inter_rect_y1 = max(b1_y1, b2_y1) - inter_rect_x2 = min(b1_x2, b2_x2) - inter_rect_y2 = min(b1_y2, b2_y2) - inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max( - inter_rect_y2 - inter_rect_y1, 0) + inter_w = np.clip(right - left, 0., 1.) + inter_h = np.clip(bottom - top, 0., 1.) + inter_area = inter_w * inter_h - return inter_area / (b1_area + b2_area + inter_area) + b1_area = (b1_right - b1_left) * (b1_bottom - b1_top) + b2_area = (b2_right - b2_left) * (b2_bottom - b2_top) + union = b1_area[:, :, np.newaxis] + b2_area[:, np.newaxis, :] - inter_area + return inter_area / union -def build_target(gtboxs, gtlabel, attrs, grid_size): - n, b, _ = gtboxs.shape - ignore_thresh = attrs["ignore_thresh"] - anchors = attrs["anchors"] - class_num = attrs["class_num"] - an_num = len(anchors) // 2 - obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32') - tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - tcls = np.zeros( - (n, an_num, grid_size, grid_size, class_num)).astype('float32') +def YOLOv3Loss(x, gtbox, gtlabel, attrs): + n, c, h, w = x.shape + b = gtbox.shape[1] + anchors = attrs['anchors'] + an_num = len(anchors) // 2 + anchor_mask = attrs['anchor_mask'] + mask_num = len(anchor_mask) + class_num = attrs["class_num"] + ignore_thresh = attrs['ignore_thresh'] + downsample = attrs['downsample'] + input_size = downsample * h + x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) + loss = np.zeros((n)).astype('float32') + + pred_box = x[:, :, :, :, :4].copy() + grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1)) + grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w)) + pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 
0])) / w + pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h + + x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:], + np.ones_like(x[:, :, :, :, 5:]) * 1.0 / + class_num) + + mask_anchors = [] + for m in anchor_mask: + mask_anchors.append((anchors[2 * m], anchors[2 * m + 1])) + anchors_s = np.array( + [(an_w / input_size, an_h / input_size) for an_w, an_h in mask_anchors]) + anchor_w = anchors_s[:, 0:1].reshape((1, mask_num, 1, 1)) + anchor_h = anchors_s[:, 1:2].reshape((1, mask_num, 1, 1)) + pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w + pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h + + pred_box = pred_box.reshape((n, -1, 4)) + pred_obj = x[:, :, :, :, 4].reshape((n, -1)) + objness = np.zeros(pred_box.shape[:2]).astype('float32') + ious = batch_xywh_box_iou(pred_box, gtbox) + ious_max = np.max(ious, axis=-1) + objness = np.where(ious_max > ignore_thresh, -np.ones_like(objness), + objness) + + gtbox_shift = gtbox.copy() + gtbox_shift[:, :, 0] = 0 + gtbox_shift[:, :, 1] = 0 + + anchors = [(anchors[2 * i], anchors[2 * i + 1]) for i in range(0, an_num)] + anchors_s = np.array( + [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors]) + anchor_boxes = np.concatenate( + [np.zeros_like(anchors_s), anchors_s], axis=-1) + anchor_boxes = np.tile(anchor_boxes[np.newaxis, :, :], (n, 1, 1)) + ious = batch_xywh_box_iou(gtbox_shift, anchor_boxes) + iou_matches = np.argmax(ious, axis=-1) + gt_matches = iou_matches.copy() for i in range(n): for j in range(b): - if gtboxs[i, j, :].sum() == 0: + if gtbox[i, j, 2:].sum() == 0: + gt_matches[i, j] = -1 continue + if iou_matches[i, j] not in anchor_mask: + gt_matches[i, j] = -1 + continue + an_idx = anchor_mask.index(iou_matches[i, j]) + gt_matches[i, j] = an_idx + gi = int(gtbox[i, j, 0] * w) + gj = int(gtbox[i, j, 1] * h) - gt_label = gtlabel[i, j] - gx = gtboxs[i, j, 0] * grid_size - gy = gtboxs[i, j, 1] * grid_size - gw = gtboxs[i, j, 2] * grid_size - gh = gtboxs[i, j, 3] * grid_size - - gi = int(gx) - gj = int(gy) - - gtbox = [0, 0, gw, gh] - max_iou = 0 - for k in range(an_num): - anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]] - iou = box_iou(gtbox, anchor_box) - if iou > max_iou: - max_iou = iou - best_an_index = k - if iou > ignore_thresh: - noobj_mask[i, best_an_index, gj, gi] = 0 - - obj_mask[i, best_an_index, gj, gi] = 1 - noobj_mask[i, best_an_index, gj, gi] = 0 - tx[i, best_an_index, gj, gi] = gx - gi - ty[i, best_an_index, gj, gi] = gy - gj - tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 * - best_an_index]) - th[i, best_an_index, gj, gi] = np.log( - gh / anchors[2 * best_an_index + 1]) - tconf[i, best_an_index, gj, gi] = 1 - tcls[i, best_an_index, gj, gi, gt_label] = 1 - - return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask) - - -def YoloV3Loss(x, gtbox, gtlabel, attrs): - n, c, h, w = x.shape - an_num = len(attrs['anchors']) // 2 - class_num = attrs["class_num"] - x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) - pred_x = sigmoid(x[:, :, :, :, 0]) - pred_y = sigmoid(x[:, :, :, :, 1]) - pred_w = x[:, :, :, :, 2] - pred_h = x[:, :, :, :, 3] - pred_conf = sigmoid(x[:, :, :, :, 4]) - pred_cls = sigmoid(x[:, :, :, :, 5:]) - - tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target( - gtbox, gtlabel, attrs, x.shape[2]) - - obj_mask_expand = np.tile( - np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num']))) - loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum()) - loss_y 
= mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum()) - loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum()) - loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum()) - loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) - loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask, - noobj_mask) - loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, - obj_mask_expand) - - return attrs['loss_weight_xy'] * (loss_x + loss_y) \ - + attrs['loss_weight_wh'] * (loss_w + loss_h) \ - + attrs['loss_weight_conf_target'] * loss_conf_target \ - + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \ - + attrs['loss_weight_class'] * loss_class + tx = gtbox[i, j, 0] * w - gi + ty = gtbox[i, j, 1] * w - gj + tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0]) + th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1]) + scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) + loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale + loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale + loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale + loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale + + objness[i, an_idx * h * w + gj * w + gi] = 1.0 + + for label_idx in range(class_num): + loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], + float(label_idx == gtlabel[i, j])) + + for j in range(mask_num * h * w): + if objness[i, j] > 0: + loss[i] += sce(pred_obj[i, j], 1.0) + elif objness[i, j] == 0: + loss[i] += sce(pred_obj[i, j], 0.0) + + return (loss, objness.reshape((n, mask_num, h, w)).astype('float32'), \ + gt_matches.astype('int32')) class TestYolov3LossOp(OpTest): def setUp(self): - self.loss_weight_xy = 1.0 - self.loss_weight_wh = 1.0 - self.loss_weight_conf_target = 1.0 - self.loss_weight_conf_notarget = 1.0 - self.loss_weight_class = 1.0 self.initTestCase() self.op_type = 'yolov3_loss' - x = np.random.random(size=self.x_shape).astype('float32') + x = logit(np.random.uniform(0, 1, self.x_shape).astype('float32')) gtbox = np.random.random(size=self.gtbox_shape).astype('float32') - gtlabel = np.random.randint(0, self.class_num, - self.gtbox_shape[:2]).astype('int32') + gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2]) + gtmask = np.random.randint(0, 2, self.gtbox_shape[:2]) + gtbox = gtbox * gtmask[:, :, np.newaxis] + gtlabel = gtlabel * gtmask self.attrs = { "anchors": self.anchors, + "anchor_mask": self.anchor_mask, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, - "loss_weight_xy": self.loss_weight_xy, - "loss_weight_wh": self.loss_weight_wh, - "loss_weight_conf_target": self.loss_weight_conf_target, - "loss_weight_conf_notarget": self.loss_weight_conf_notarget, - "loss_weight_class": self.loss_weight_class, + "downsample": self.downsample, } - self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel} + self.inputs = { + 'X': x, + 'GTBox': gtbox.astype('float32'), + 'GTLabel': gtlabel.astype('int32'), + } + loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs) self.outputs = { - 'Loss': np.array( - [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32') + 'Loss': loss, + 'ObjectnessMask': objness, + "GTMatchMask": gt_matches } def test_check_output(self): @@ -196,19 +201,16 @@ class TestYolov3LossOp(OpTest): place, ['X'], 'Loss', no_grad_set=set(["GTBox", "GTLabel"]), - max_relative_error=0.06) + max_relative_error=0.3) def initTestCase(self): - self.anchors = [10, 13, 12, 12] - self.class_num = 10 + self.anchors = [10, 13, 16, 30, 33, 23] + 
self.anchor_mask = [1, 2] + self.class_num = 5 self.ignore_thresh = 0.5 - self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) - self.gtbox_shape = (5, 10, 4) - self.loss_weight_xy = 2.5 - self.loss_weight_wh = 0.8 - self.loss_weight_conf_target = 1.5 - self.loss_weight_conf_notarget = 0.5 - self.loss_weight_class = 1.2 + self.downsample = 32 + self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) + self.gtbox_shape = (3, 5, 4) if __name__ == "__main__": diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py index f33c05ed2f48c2498b98fc486d6ff7471088d77e..82d0d336e523ec48c5ceca3b92ff0963c4499123 100644 --- a/python/paddle/fluid/transpiler/details/__init__.py +++ b/python/paddle/fluid/transpiler/details/__init__.py @@ -17,3 +17,4 @@ from __future__ import print_function from .program_utils import * from .ufind import * from .checkport import * +from .vars_distributed import * diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py index 6b78ceeaeec4d9b3db6524a5b5e939f88267340c..89dd4dd50b0299de986b84f46e889d554030f180 100644 --- a/python/paddle/fluid/transpiler/details/checkport.py +++ b/python/paddle/fluid/transpiler/details/checkport.py @@ -16,6 +16,7 @@ import sys import time import socket from contextlib import closing +from six import string_types def wait_server_ready(endpoints): @@ -32,6 +33,7 @@ def wait_server_ready(endpoints): wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) """ + assert not isinstance(endpoints, string_types) while True: all_ok = True not_ready_endpoints = [] @@ -45,7 +47,7 @@ def wait_server_ready(endpoints): all_ok = False not_ready_endpoints.append(ep) if not all_ok: - sys.stderr.write("pserver not ready, wait 3 sec to retry...\n") + sys.stderr.write("server not ready, wait 3 sec to retry...\n") sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) + "\n") sys.stderr.flush() diff --git a/python/paddle/fluid/transpiler/details/vars_distributed.py b/python/paddle/fluid/transpiler/details/vars_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..05e7f6e3e706376efc8af870a780d96c45642514 --- /dev/null +++ b/python/paddle/fluid/transpiler/details/vars_distributed.py @@ -0,0 +1,269 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +from paddle.fluid.framework import Variable + + +class VarStruct(object): + """ + record part properties of a Variable in python. + """ + + def __init__(self, name, shape, dtype, type, lod_level, persistable): + self.name = name + self.shape = shape + self.dtype = dtype + self.type = type + self.lod_level = lod_level + self.persistable = persistable + + +class VarDistributed(object): + """ + a class to record the var distributed on parameter servers. + the class will record the relationship between origin var and slice var. 
+    the slice var's properties, such as type/shape/offset/endpoint.
+    """
+
+    def __init__(self,
+                 origin_var,
+                 slice_var,
+                 is_slice=None,
+                 block_id=None,
+                 offset=None,
+                 vtype=None,
+                 endpoint=None):
+        """
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): slice or not; slice_var=True/False and its block size > 8192 are the judgement criteria.
+            block_id(int|None): the block number of the slice var.
+            offset(int|None): if the slice var is sliced, offset is the numel before the slice.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): which parameter server the slice var is on, such as "127.0.0.1:1001"
+        """
+
+        if isinstance(origin_var, Variable):
+            self.origin = self.__create_var_struct(origin_var)
+        else:
+            self.origin = origin_var
+
+        if isinstance(slice_var, Variable):
+            self.slice = self.__create_var_struct(slice_var)
+        else:
+            self.slice = slice_var
+
+        if self.equal(self.origin, self.slice):
+            self.is_slice = False
+            self.block_id = 0
+            self.offset = 0
+        else:
+            self.is_slice = True
+            self.block_id = 0
+            self.offset = 0
+
+        if is_slice is not None:
+            self.is_slice = is_slice
+        if block_id is not None:
+            self.block_id = block_id
+        if offset is not None:
+            self.offset = offset
+
+        self.vtype = vtype
+        self.endpoint = endpoint
+
+    @staticmethod
+    def __create_var_struct(var):
+        return VarStruct(var.name, var.shape, var.dtype, var.type,
+                         var.lod_level, var.persistable)
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        whether the two vars are equal.
+        Returns:
+            bool: True if equal, otherwise False
+        """
+        assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct)
+
+        return var1.name == var2.name and \
+               var1.type == var2.type and \
+               var1.shape == var2.shape and \
+               var1.dtype == var2.dtype and \
+               var1.lod_level == var2.lod_level and \
+               var1.persistable == var2.persistable
+
+    def __str__(self):
+        origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \
+            format(i="{", e="}", name=self.origin.name, type=self.origin.type,
+                   shape=self.origin.shape, dtype=self.origin.dtype)
+
+        slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \
+            ".slice({is_slice}).block({block_id}).offset({offset})". \
+            format(i="{", e="}", name=self.slice.name, type=self.slice.type,
+                   shape=self.slice.shape, dtype=self.slice.dtype,
+                   is_slice=self.is_slice, block_id=self.block_id, offset=self.offset)
+
+        return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format(
+            self.vtype, origin_var_str, slice_var_str, self.endpoint)
+
+
+class VarsDistributed(object):
+    """
+    a collection of VarDistributed with methods to find distributed vars.
+    through this class, we can get an overview of the distributed parameters on parameter servers.
+    this class centralizes the information and makes it convenient for developers to manage and query how variables are distributed.
+    other modules, such as io.py, can also use this to find variables.
+    """
+
+    def __init__(self):
+        self.distributed_vars = []
+
+    def add_distributed_var(self,
+                            origin_var,
+                            slice_var,
+                            is_slice=None,
+                            block_id=None,
+                            offset=None,
+                            vtype=None,
+                            endpoint=None):
+        """
+        add a distributed var to this collection.
+
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): slice or not; slice_var=True/False and its block size > 8192 are the judgement criteria.
+            block_id(int|None): the block number of the slice var.
+            offset(int|None): if the slice var is sliced, offset is the numel before the slice.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): which parameter server the slice var is on, such as "127.0.0.1:1001"
+        Returns:
+            None
+        """
+        self.distributed_vars.append(
+            VarDistributed(origin_var, slice_var, is_slice, block_id, offset,
+                           vtype, endpoint))
+
+    def get_distributed_var_by_slice(self, var_name):
+        """
+        get a distributed var by the slice var name.
+
+        Args:
+            var_name(str): slice var name, such as "w.trainer0.block1"
+        Returns:
+            VarDistributed: distributed var.
+        """
+        for dist_var in self.distributed_vars:
+            if dist_var.slice.name == var_name:
+                return dist_var
+        return None
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        whether the two vars are equal.
+        Returns:
+            bool: True if equal, otherwise False
+        """
+        return var1.name == var2.name and \
+               var1.type == var2.type and \
+               var1.shape == var2.shape and \
+               var1.dtype == var2.dtype and \
+               var1.lod_level == var2.lod_level and \
+               var1.persistable == var2.persistable
+
+    def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint):
+        """
+        get a distributed var by the origin var name and endpoint.
+
+        Args:
+            origin_var_name(str): the origin var's name
+            endpoint(str): the parameter server endpoint, such as "127.0.0.1:1001"
+        Returns:
+            VarDistributed: distributed var.
+        """
+        for dist_var in self.distributed_vars:
+            if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint:
+                return dist_var
+        return None
+
+    def get_distributed_vars_by_vtypes(self, vtypes, groupby=False):
+        """
+        get distributed vars by their vtypes.
+
+        Args:
+            vtypes(list): distributed vars' vtypes, such as "Optimizer", "RemotePrefetch"
+            groupby(bool|False): group by origin var or not.
+
+        Returns:
+            list: distributed var list.
+            dict: distributed var map when groupby=True
+        """
+        vtype_vars = []
+        for var in self.distributed_vars:
+            if var.vtype in vtypes:
+                vtype_vars.append(var)
+        if not groupby:
+            return vtype_vars
+
+        params_map = {}
+        for var in vtype_vars:
+            origin_var_name = var.origin.name
+
+            if origin_var_name in params_map.keys():
+                optimizers = params_map.get(origin_var_name)
+            else:
+                optimizers = []
+            optimizers.append(var)
+            params_map[origin_var_name] = optimizers
+        return params_map
+
+    def get_distributed_vars_by_ep(self, endpoint, vtype=None):
+        """
+        get distributed vars by the parameter server endpoint.
+
+        Args:
+            endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001"
+            vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
+
+        Returns:
+            list: distributed var list.
+        """
+        endpoint_vars = []
+        for var in self.distributed_vars:
+            if var.endpoint == endpoint:
+                endpoint_vars.append(var)
+        if not vtype:
+            return endpoint_vars
+
+        vtype_vars = []
+        for var in endpoint_vars:
+            if var.vtype == vtype:
+                vtype_vars.append(var)
+        return vtype_vars
+
+    def overview(self):
+        """
+        get the overview string about all params on all parameter servers.
+
+        Returns:
+            str: overview string.
+
+        """
+        vars_str = []
+        for var in self.distributed_vars:
+            vars_str.append(str(var))
+        return "\n".join(vars_str)
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index ea5a4cf7cdb3ef91a02bb88d9b859da1ecd1ed0b..a3293afbbd7cef8470c808e98ae88a05f2e492f4 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -30,19 +30,23 @@ Steps to transpile pserver: 5. 
add listen_and_serv op """ +import sys import math -import numpy as np +from functools import reduce + import collections +import six import logging +import numpy as np + from .ps_dispatcher import RoundRobin, PSDispatcher from .. import core, framework, unique_name from ..framework import Program, default_main_program, \ - default_startup_program, Block, \ - Parameter, grad_var_name -from .details import * + default_startup_program, Block, Parameter, grad_var_name +from .details import wait_server_ready, UnionFind, VarStruct, VarsDistributed +from .details import delete_ops, find_op_by_output_arg from ..distribute_lookup_table import find_distributed_lookup_table -from functools import reduce LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" @@ -327,6 +331,7 @@ class DistributeTranspiler(object): self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") self.pserver_endpoints = pserver_endpoints + self.vars_overview = VarsDistributed() self.optimize_ops, self.params_grads = self._get_optimize_pass() ps_dispatcher = self.config.split_method(self.pserver_endpoints) @@ -347,6 +352,7 @@ class DistributeTranspiler(object): # add distributed attrs to program self.origin_program._is_distributed = True self.origin_program._endpoints = self.pserver_endpoints + self.origin_program._ps_endpoint = current_endpoint self.origin_program._is_chief = self.trainer_id == 0 self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None @@ -454,6 +460,10 @@ class DistributeTranspiler(object): self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i]) self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) + distributed_var = self.vars_overview.get_distributed_var_by_slice( + recv_vars[i].name) + distributed_var.endpoint = ep + # step4: Concat the parameters splits together after recv. all_recv_outputs = [] for param_varname, splited_var in six.iteritems(self.param_var_mapping): @@ -480,6 +490,12 @@ class DistributeTranspiler(object): recv_op_role_var_name = splited_trainer_grad[0].name if param_varname in self.sparse_param_to_height_sections: + + for table_name in table_names: + distributed_var = self.vars_overview.get_distributed_var_by_slice( + table_name) + distributed_var.vtype = "RemotePrefetch" + height_sections = self.sparse_param_to_height_sections[ param_varname] self._update_remote_sparse_update_op( @@ -532,6 +548,9 @@ class DistributeTranspiler(object): pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints) + self._get_distributed_optimizer_vars() + self.origin_program._parameters_on_pservers = self.vars_overview + def get_trainer_program(self, wait_port=True): """ Get transpiled trainer side program. @@ -541,6 +560,7 @@ class DistributeTranspiler(object): """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? + lr_ops = self._get_lr_ops() delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), lr_ops) @@ -665,9 +685,14 @@ class DistributeTranspiler(object): # NOTE: assume blocks of the same variable is not distributed # on the same pserver, only change param/grad varnames for # trainers to fetch. 
+ sys.stderr.write( + "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n" + ) # step1 pserver_program = Program() pserver_program.random_seed = self.origin_program.random_seed + pserver_program._copy_dist_param_info_from(self.origin_program) + # step2: Create vars to receive vars at parameter servers. recv_inputs = [] for v in self.param_grad_ep_mapping[endpoint]["params"]: @@ -703,9 +728,6 @@ class DistributeTranspiler(object): else: recv_inputs.append(single_trainer_var) - self._slice_params_and_optimizes = self._get_slice_vars_and_attrs( - endpoint) - # step 3 # Create a union-find data structure from optimize ops, # If two ops are connected, we could add these two ops @@ -882,10 +904,6 @@ class DistributeTranspiler(object): outputs={}, attrs=attrs) - # add distributed attrs - pserver_program._slice_vars_and_attrs = list( - self._slice_params_and_optimizes.values()) - pserver_program._sync_with_cpp() # save pserver program to generate pserver side startup relatively. self.pserver_program = pserver_program @@ -984,30 +1002,88 @@ class DistributeTranspiler(object): inputs={"X": startup_param_var}, outputs={"Out": startup_tmpvar}) - # add slice vars - s_prog._slice_vars_and_attrs = pserver_program._slice_vars_and_attrs - return s_prog - def _get_slice_vars_and_attrs(self, endpoint): - slice_vars_and_attrs = {} + # ====================== private transpiler functions ===================== + def _get_slice_var_info(self, slice_var): block_suffix = "block" - for param in self.param_grad_ep_mapping[endpoint]["params"]: - orig_var_name, block_name, _ = self._get_varname_parts(param.name) - if not block_name: - continue + block_idx = 0 + offset = 0 + is_slice = False - block_idx = int(block_name.split(block_suffix)[1]) - orig_var = self.origin_program.global_block().vars[orig_var_name] + orig_var_name, block_name, _ = self._get_varname_parts(slice_var.name) - skip_dim0 = 0 - slice_vars = self.param_var_mapping[orig_var_name] - for slice_var in slice_vars[:block_idx]: - skip_dim0 += slice_var.shape[0] - slice_vars_and_attrs[param.name] = [orig_var, skip_dim0, param] - return slice_vars_and_attrs + if not block_name: + return is_slice, block_idx, offset - # ====================== private transpiler functions ===================== + block_idx = int(block_name.split(block_suffix)[1]) + skip_dim0 = 0 + slice_vars = self.param_var_mapping[orig_var_name] + + orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:]) + + for slice_var in slice_vars[:block_idx]: + skip_dim0 += slice_var.shape[0] + + offset = skip_dim0 * orig_dim1_flatten + is_slice = True + return is_slice, block_idx, offset + + def _get_distributed_optimizer_vars(self): + def _get_distributed_optimizer_var(endpoint): + opt_op_on_pserver = [] + for _, op in enumerate(self.optimize_ops): + if self._is_optimizer_op(op) and self._is_opt_op_on_pserver( + endpoint, op): + opt_op_on_pserver.append(op) + + for opt_op in opt_op_on_pserver: + dist_var = None + for key in opt_op.input_names: + if key == "Param": + param_name = opt_op.input(key)[0] + dist_var = self.vars_overview.get_distributed_var_by_origin_and_ep( + param_name, endpoint) + break + for key in opt_op.input_names: + if key in ["Param", "Grad", "LearningRate"]: + continue + origin_var = self.origin_program.global_block().vars[ + opt_op.input(key)[0]] + # update accumulator variable shape + new_shape = self._get_optimizer_input_shape( + opt_op.type, key, origin_var.shape, + dist_var.slice.shape) + + if 
new_shape == dist_var.slice.shape: + splited_var = VarStruct( + name=origin_var.name, + shape=new_shape, + dtype=origin_var.dtype, + type=origin_var.type, + lod_level=origin_var.lod_level, + persistable=origin_var.persistable) + + self.vars_overview.add_distributed_var( + origin_var=origin_var, + slice_var=splited_var, + is_slice=dist_var.is_slice, + block_id=dist_var.block_id, + offset=dist_var.offset, + vtype="Optimizer", + endpoint=endpoint) + else: + self.vars_overview.add_distributed_var( + origin_var=origin_var, + slice_var=origin_var, + is_slice=False, + block_id=0, + offset=0, + vtype="Optimizer", + endpoint=endpoint) + + for ep in self.pserver_endpoints: + _get_distributed_optimizer_var(ep) def _update_dist_lookup_table_vars(self, param_list, grad_list, params_grads): @@ -1093,6 +1169,22 @@ class DistributeTranspiler(object): # origin_param_name -> [splited_param_vars] self.param_var_mapping = self._create_vars_from_blocklist( self.origin_program, param_blocks) + + for orig_name, splited_vars in self.param_var_mapping.items(): + orig_var = self.origin_program.global_block().var(orig_name) + + for splited_var in splited_vars: + is_slice, block_id, offset = self._get_slice_var_info( + splited_var) + + self.vars_overview.add_distributed_var( + origin_var=orig_var, + slice_var=splited_var, + block_id=block_id, + offset=offset, + is_slice=is_slice, + vtype="Param") + # origin_grad_name -> [splited_grad_vars] self.grad_var_mapping = self._create_vars_from_blocklist( self.origin_program, @@ -1729,13 +1821,6 @@ class DistributeTranspiler(object): shape=new_shape) new_inputs[key] = tmpvar - # var shape been changed - if new_shape != var.shape: - slice_var_args = self._slice_params_and_optimizes[ - param_var.name] - self._slice_params_and_optimizes[ - var.name] = [var, slice_var_args[1], tmpvar] - # change output's ParamOut variable outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, opt_op) @@ -1763,8 +1848,8 @@ class DistributeTranspiler(object): # skip per trainer vars if g.name.find(".trainer_") == -1: # only param or grads have splited blocks - if self._orig_varname(g.name) in self.grad_name_to_param_name or\ - self._orig_varname(g.name) in self.param_name_to_grad_name: + if self._orig_varname(g.name) in self.grad_name_to_param_name or \ + self._orig_varname(g.name) in self.param_name_to_grad_name: grad_block = g break return grad_block diff --git a/python/setup.py.in b/python/setup.py.in index fb4b273a0676fcbcb4402eaf54ddf73d37a2754f..f93f0cd130e33311bade2b15726c3eff37546214 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -109,6 +109,8 @@ packages=['paddle', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', + 'paddle.fluid.contrib.int8_inference', + 'paddle.fluid.contrib.reader', 'paddle.fluid.contrib.slim', 'paddle.fluid.contrib.slim.core', 'paddle.fluid.contrib.slim.graph',